├── tao_action_recognition ├── data_generation │ ├── resources │ │ ├── of_preprocess_pipe.png │ │ └── rgb_preprocess_pipe.png │ ├── jetson_of │ │ └── vpi │ │ │ ├── CMakeLists.txt │ │ │ └── main.cpp │ ├── convert_dataset.py │ ├── README.md │ ├── split_dataset.py │ ├── preprocess_HMDB_RGB.sh │ ├── generate_new_dataset_format.py │ ├── preprocess_HMDB.sh │ ├── preprocess_SHAD_RGB.sh │ ├── convert_of.py │ ├── preprocess_SHAD.sh │ └── save_tracks_shad.py ├── tensorrt_inference │ ├── trt_inference │ │ ├── __init__.py │ │ └── engine.py │ ├── README.md │ ├── ar_trt_inference.py │ └── ar_of_trt_inference.py ├── specs │ ├── i3d_rgb_3d_64_export.yaml │ └── train_rgb_3d_64_i3d.yaml └── doc │ └── load_I3D.md ├── README.md ├── tao_ocdr └── handwritten │ ├── specs │ ├── ocd │ │ ├── export.yaml │ │ ├── inference.yaml │ │ ├── evaluate.yaml │ │ └── train.yaml │ └── ocr │ │ └── experiment.yaml │ └── preprocess_data.py ├── tao_key_points_estimation └── tensorrt_inference │ ├── trt_inference │ ├── __init__.py │ └── engine.py │ ├── README.md │ └── fpenet_trt_inference.py ├── tao_pointpillars └── tensorrt_sample │ ├── include │ ├── postprocess.h │ └── pointpillar.h │ ├── README.md │ ├── test │ ├── CMakeLists.txt │ └── main.cpp │ ├── src │ ├── postprocess.cpp │ └── pointpillar.cpp │ └── LICENSE ├── tao_retinanet ├── README.md └── tao_retinanet_scales_aspect_ratio_estimate.py ├── tao_object_dection └── yolov4 │ ├── specs │ ├── classification_cspdarknet53.txt │ └── yolov4_416_coco14.txt │ └── README.md ├── tao_classification ├── mobilenet_v2 │ └── mobilenetv2_imagenet2012.txt └── deploy_to_deepstream │ └── README.md ├── LICENSE ├── tao_training_without_network └── Guide.md ├── tao_api └── how_to_modify_code_for_TAO_API.md └── tao_forum_faq └── FAQ.md /tao_action_recognition/data_generation/resources/of_preprocess_pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/tao_toolkit_recipes/HEAD/tao_action_recognition/data_generation/resources/of_preprocess_pipe.png -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/resources/rgb_preprocess_pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA-AI-IOT/tao_toolkit_recipes/HEAD/tao_action_recognition/data_generation/resources/rgb_preprocess_pipe.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reference samples of data generation and tensorrt inference for TAO Toolkit 2 | This repository provides reference samples of data generation and tensorrt inference for [TAO Toolkit](https://developer.nvidia.com/tao-toolkit) 3 | 4 | The supported task: 5 | 6 | - [Action Recognition](https://github.com/NVIDIA-AI-IOT/tao_toolkit_recipes/tree/main/tao_action_recognition) 7 | -------------------------------------------------------------------------------- /tao_ocdr/handwritten/specs/ocd/export.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | load_pruned_graph: False 3 | pruned_graph_path: '/results/prune/pruned_0.1.pth' 4 | 5 | export: 6 | results_dir: /results/ocd/export 7 | checkpoint: '/results/train/model_best.pth' 8 | onnx_file: '/results/export/model_best.onnx' 9 | width: 1024 10 | height: 1024 11 | 12 | dataset: 13 | validate_dataset: 14 | data_path: 
['/data/ocdnet/iamdata/test'] 15 | -------------------------------------------------------------------------------- /tao_ocdr/handwritten/specs/ocd/inference.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | load_pruned_graph: false 3 | pruned_graph_path: '/results/prune/pruned_0.1.pth' 4 | 5 | inference: 6 | checkpoint: '/results/train/model_best.pth' 7 | input_folder: /data/ocdnet/iamdata/test/img 8 | width: 1024 9 | height: 1024 10 | img_mode: BGR 11 | polygon: false 12 | show: false 13 | results_dir: /results/inference 14 | 15 | post_processing: 16 | type: SegDetectorRepresenter 17 | args: 18 | thresh: 0.45 19 | box_thresh: 0.55 20 | max_candidates: 1000 21 | unclip_ratio: 1.5 22 | 23 | -------------------------------------------------------------------------------- /tao_key_points_estimation/tensorrt_inference/trt_inference/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .engine import * 17 | -------------------------------------------------------------------------------- /tao_key_points_estimation/tensorrt_inference/README.md: -------------------------------------------------------------------------------- 1 | # TensorRT inference sample for TAO key points estimation 2 | 3 | ## Introduction 4 | This is a TensorRT inference sample for TAO key points estimation. This sample will consume TensorRT engine and json format input generated in FPENet notebook. 5 | 6 | ## Prequisites 7 | `TensorRT`, `numpy`, `cv2` is needed for this sample. You can try TensorRT docker image on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) for easily building environment. 
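The helper in `trt_inference/engine.py` wraps engine loading and execution. The snippet below is only a minimal, illustrative sketch of how a serialized TensorRT engine can be deserialized with the TensorRT Python API; it is not the repository's implementation and the actual helper may differ.

```python
# Illustrative only -- the repo's trt_inference/engine.py may be implemented differently.
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(engine_path):
    """Deserialize a TensorRT engine file (e.g. the exported trt_fpenet.engine)."""
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

engine = load_engine("trt_fpenet.engine")
context = engine.create_execution_context()  # reused for every inference call
```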
8 | 9 | 10 | ## Steps to run inference: 11 | 12 | ```sh 13 | # Generate TensorRT engine of fpenet model 14 | tao fpenet export -m -k -o --engine_file trt_fpenet.engine 15 | 16 | # run inference: 17 | python3 fpenet_trt_inference.py --input_json= --trt_engine= --output_img_dir= 18 | ``` -------------------------------------------------------------------------------- /tao_ocdr/handwritten/specs/ocd/evaluate.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | load_pruned_graph: False 3 | pruned_graph_path: '/results/prune/pruned_0.1.pth' 4 | 5 | evaluate: 6 | results_dir: /results/ocd/evaluate 7 | checkpoint: /results/train/model_best.pth 8 | gpu_id: 0 9 | post_processing: 10 | type: SegDetectorRepresenter 11 | args: 12 | thresh: 0.45 13 | box_thresh: 0.55 14 | max_candidates: 1000 15 | unclip_ratio: 1.5 16 | 17 | metric: 18 | type: QuadMetric 19 | args: 20 | is_output_polygon: false 21 | 22 | 23 | dataset: 24 | validate_dataset: 25 | data_path: ['/data/ocdnet/iamdata/test'] 26 | args: 27 | pre_processes: 28 | - type: Resize2D 29 | args: 30 | short_size: 31 | - 2464 32 | - 3520 33 | resize_text_polys: true 34 | img_mode: BGR 35 | filter_keys: [] 36 | ignore_tags: ['*', '###'] 37 | loader: 38 | batch_size: 1 39 | shuffle: false 40 | pin_memory: false 41 | num_workers: 4 42 | 43 | 44 | -------------------------------------------------------------------------------- /tao_action_recognition/tensorrt_inference/trt_inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | from .engine import * 22 | -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/include/postprocess.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef POSTPROCESS_H_ 19 | #define POSTPROCESS_H_ 20 | 21 | #include 22 | 23 | struct Bndbox { 24 | float x; 25 | float y; 26 | float z; 27 | float w; 28 | float l; 29 | float h; 30 | float rt; 31 | int id; 32 | float score; 33 | Bndbox(){}; 34 | Bndbox(float x_, float y_, float z_, float l_, float w_, float h_, float rt_, int id_, float score_) 35 | : x(x_), y(y_), z(z_), w(w_), l(l_), h(h_), rt(rt_), id(id_), score(score_) {} 36 | }; 37 | 38 | int nms_cpu(std::vector bndboxes, const float nms_thresh, 39 | std::vector &nms_pred, const int pre_nms_top_n); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /tao_ocdr/handwritten/specs/ocr/experiment.yaml: -------------------------------------------------------------------------------- 1 | results_dir: /results 2 | encryption_key: nvidia_tao 3 | model: 4 | TPS: True 5 | backbone: ResNet 6 | feature_channel: 512 7 | sequence: BiLSTM 8 | hidden_size: 256 9 | prediction: CTC 10 | quantize: False 11 | input_width: 100 12 | input_height: 32 13 | input_channel: 1 14 | dataset: 15 | train_dataset_dir: [] 16 | val_dataset_dir: /data/test/lmdb 17 | character_list_file: /data/character_list 18 | max_label_length: 25 19 | batch_size: 32 20 | workers: 4 21 | augmentation: 22 | keep_aspect_ratio: False 23 | train: 24 | seed: 1111 25 | gpu_ids: [0] 26 | optim: 27 | name: "adadelta" 28 | lr: 0.1 29 | clip_grad_norm: 5.0 30 | num_epochs: 10 31 | checkpoint_interval: 2 32 | validation_interval: 1 33 | evaluate: 34 | gpu_id: 0 35 | checkpoint: "??" 36 | test_dataset_dir: "??" 37 | results_dir: "${results_dir}/evaluate" 38 | prune: 39 | gpu_id: 0 40 | checkpoint: "??" 41 | results_dir: "${results_dir}/prune" 42 | prune_setting: 43 | mode: experimental_hybrid 44 | amount: 0.4 45 | granularity: 8 46 | raw_prune_score: L1 47 | inference: 48 | gpu_id: 0 49 | checkpoint: "??" 50 | inference_dataset_dir: "??" 51 | results_dir: "${results_dir}/inference" 52 | export: 53 | gpu_id: 0 54 | checkpoint: "??" 55 | results_dir: "${results_dir}/export" 56 | dataset_convert: 57 | input_img_dir: "??" 58 | gt_file: "??" 59 | results_dir: "${results_dir}/convert_dataset" 60 | gen_trt_engine: 61 | onnx_file: "??" 62 | results_dir: "${results_dir}/convert_dataset" 63 | -------------------------------------------------------------------------------- /tao_retinanet/README.md: -------------------------------------------------------------------------------- 1 | # Sample to estimate best scales and aspect ratio values for TAO retinanet 2 | 3 | This is an experimental sample to estimate best scales and aspect ratio values for TAO retinanet: 4 | 5 | ``` 6 | retinanet_config { 7 | aspect_ratios_global: "[1.0, 2.0, 0.5]" 8 | scales: "[0.05, 0.2, 0.35, 0.5, 0.65, 0.8]" 9 | ``` 10 | 11 | Please do try more parameters for best model performance. 
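For example, the values reported by the sample run at the end of this README could be rounded and dropped back into the spec as shown below; everything else in `retinanet_config` stays as in your original training spec:

```
retinanet_config {
  aspect_ratios_global: "[0.53, 1.0, 2.43]"
  scales: "[0.07, 0.13, 0.21, 0.33, 0.49, 0.82]"
  ...
}
```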
12 | 13 | 14 | ## Detailed Steps 15 | 16 | ### Download kitti dataset 17 | 18 | Assume link below has the txt label files: 19 | ``` 20 | /home/user/tlt-experiments/data/training/label_2/ 21 | ``` 22 | 23 | 24 | 25 | ### Prepare parameters for sample to estimate optimal values 26 | 27 | #### Change tao_retinanet_scales_aspect_ratio_estimate.py to point to correct folder for labels 28 | ``` 29 | folder="/home/user/tlt-experiments/data/training/label_2/" 30 | ``` 31 | 32 | 33 | #### Change tao_retinanet_scales_aspect_ratio_estimate.py to set shorten value of image width and image height 34 | ``` 35 | shorter_length_of_image = 375 36 | ``` 37 | 38 | 39 | ##### Change tao_retinanet_scales_aspect_ratio_estimate.py to remove outliers for aspect ratios 40 | ``` 41 | limit_max_ar=4 42 | ``` 43 | 44 | 45 | 46 | ### Run sample to estimate optimal values 47 | 48 | ``` 49 | python tao_retinanet_scales_aspect_ratio_estimate.py 50 | ``` 51 | 52 | 53 | 54 | ### Running log with kitti dataset 55 | 56 | 57 | ``` 58 | scales: 59 | [0.0691874 0.13098365 0.21473368 0.33218772 0.48606437 0.82403735] 60 | aspect ratios from algo 61 | [0.52653116 1.36425734 2.43270715] 62 | aspect ratios considering 1.0: 63 | [0.52653116 1. 2.43270715] 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /tao_action_recognition/tensorrt_inference/README.md: -------------------------------------------------------------------------------- 1 | # TensorRT inference sample for TAO ActionRecognitionNet 2 | 3 | ## Introduction 4 | This is a TensorRT inference sample with TAO ActionRecognitionNet deployable model. This sample will consume TensorRT engine and sequence of images and predict the people's action in those images. 5 | 6 | ## Prequisites 7 | `TensorRT`, `numpy`, `PIL` is needed for this sample. You can try TensorRT docker image on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) for easily building environment. 8 | 9 | You also need to download `tao-converter` from [TAO toolkit](https://developer.nvidia.com/tao-toolkit-get-started) to convert the encrypted tao model to TensorRT engine. 
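Before running the inference script, the frames have to be assembled into the engine's `input_rgb` tensor. The sketch below shows one plausible way to build the `1x3x32x224x224` input for the 3D engine with `PIL`/`numpy`; the mean/std values are taken from the training specs in this repo, and the exact resize/crop logic in `ar_trt_inference.py` may differ.

```python
# Illustrative preprocessing sketch -- not necessarily identical to ar_trt_inference.py.
import os
import numpy as np
from PIL import Image

MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)  # from the training specs in this repo
STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def load_clip(images_folder, seq_length=32, size=224):
    """Read the first `seq_length` frames and return a 1 x 3 x T x H x W float32 array."""
    names = sorted(os.listdir(images_folder))[:seq_length]
    frames = []
    for name in names:
        img = Image.open(os.path.join(images_folder, name)).convert("RGB")
        img = img.resize((size, size), Image.BILINEAR)        # simple resize; a center crop is also common
        arr = (np.asarray(img, dtype=np.float32) / 255.0 - MEAN) / STD
        frames.append(arr.transpose(2, 0, 1))                 # HWC -> CHW
    clip = np.stack(frames, axis=1)                           # C x T x H x W
    return np.ascontiguousarray(clip[np.newaxis], dtype=np.float32)  # 1 x 3 x 32 x 224 x 224
```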
10 | 11 | ## Steps to run inference: 12 | 13 | ```sh 14 | # Download the deployable action recognition model from NGC 15 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/tao/actionrecognitionnet/versions/deployable_v1.0/zip -O actionrecognitionnet_deployable_v1.0.zip 16 | 17 | # Generate TensorRT engine of action recognition model 18 | # generate engine of 2D model: 19 | tao-converter resnet18_2d_rgb_hmdb5_32.etlt -k nvidia_tao -p input_rgb,1x96x224x224,1x96x224x224,1x96x224x224 -e trt2d.engine -t fp16 20 | # generate engine of 3D model: 21 | tao-converter resnet18_3d_rgb_hmdb5_32.etlt -k nvidia_tao -p input_rgb,1x3x32x224x224,1x3x32x224x224,1x3x32x224x224 -e trt3d.engine -t fp16 22 | 23 | # run inference: 24 | # run inference with 2D engine: 25 | python ar_trt_inference.py --input_images_folder=/path/to/images --trt_engine=./trt2d.engine --input_2d 26 | # run inference with 3D engine: 27 | python ar_trt_inference.py --input_images_folder=/path/to/images --trt_engine=./trt3d.engine 28 | ``` 29 | -------------------------------------------------------------------------------- /tao_object_dection/yolov4/specs/classification_cspdarknet53.txt: -------------------------------------------------------------------------------- 1 | model_config { 2 | # Model Architecture can be chosen from: 3 | # ['resnet', 'vgg', 'googlenet', 'alexnet'] 4 | arch: "cspdarknet" 5 | 6 | # for resnet --> n_layers can be [10, 18, 50] 7 | # for vgg --> n_layers can be [16, 19] 8 | n_layers: 53 9 | use_batch_norm: True 10 | use_bias: False 11 | use_imagenet_head: True 12 | all_projections: False 13 | use_pooling: True 14 | # if you want to use the pretrained model, 15 | # image size should be "3,224,224" 16 | # otherwise, it can be "3, X, Y", where X,Y >= 16 17 | input_image_size: "3,224,224" 18 | } 19 | train_config { 20 | train_dataset_path: "/workspace/tao-experiments/data/imagenet2012/train" 21 | val_dataset_path: "/workspace/tao-experiments/data/imagenet2012/val" 22 | # Only ['sgd', 'adam'] are supported for optimizer 23 | optimizer { 24 | sgd { 25 | lr: 0.01 26 | decay: 0.0 27 | momentum: 0.9 28 | nesterov: False 29 | } 30 | } 31 | preprocess_mode: "torch" 32 | enable_random_crop: True 33 | enable_center_crop: True 34 | label_smoothing: 0.0 35 | batch_size_per_gpu: 64 36 | n_epochs: 200 37 | mixup_alpha: 0.2 38 | 39 | # Number of CPU cores for loading data 40 | n_workers: 40 41 | 42 | # regularizer 43 | reg_config { 44 | # regularizer type can be "L1", "L2" or "None". 45 | type: "L2" 46 | # if the type is not "None", 47 | # scope can be either "Conv2D" or "Dense" or both. 
48 | scope: "Conv2D,Dense" 49 | # 0 < weight decay < 1 50 | weight_decay: 0.00003 51 | } 52 | 53 | # learning_rate 54 | lr_config { 55 | cosine{ 56 | learning_rate: 0.05 57 | soft_start: 0.0 58 | min_lr_ratio: 0.001 59 | } 60 | } 61 | } 62 | 63 | -------------------------------------------------------------------------------- /tao_action_recognition/specs/i3d_rgb_3d_64_export.yaml: -------------------------------------------------------------------------------- 1 | output_file: /workspace/rgb_3d_hmdb/i3d_rgb3d_64.etlt 2 | model: /workspace/rgb_3d_hmdb/i3d_rgb3d_64.tlt 3 | batch_size: 1 4 | encryption_key: nvidia_tlt 5 | gpu_id: 0 6 | model_config: 7 | model_type: rgb 8 | input_type: "3d" 9 | backbone: i3d 10 | rgb_seq_length: 64 11 | sample_strategy: consecutive 12 | sample_rate: 1 13 | dataset_config: 14 | train_dataset_dir: /raid/HMDB51_splitted_org/train 15 | val_dataset_dir: /raid/HMDB51_splitted_org/test 16 | label_map: 17 | throw: 0 18 | push: 1 19 | dribble: 2 20 | shoot_gun: 3 21 | hug: 4 22 | smile: 5 23 | fall_floor: 6 24 | chew: 7 25 | turn: 8 26 | cartwheel: 9 27 | stand: 10 28 | draw_sword: 11 29 | drink: 12 30 | eat: 13 31 | talk: 14 32 | climb: 15 33 | smoke: 16 34 | pick: 17 35 | shake_hands: 18 36 | pushup: 19 37 | swing_baseball: 20 38 | somersault: 21 39 | walk: 22 40 | flic_flac: 23 41 | run: 24 42 | ride_horse: 25 43 | sit: 26 44 | kiss: 27 45 | situp: 28 46 | hit: 29 47 | handstand: 30 48 | climb_stairs: 31 49 | pour: 32 50 | shoot_bow: 33 51 | kick_ball: 34 52 | brush_hair: 35 53 | sword_exercise: 36 54 | dive: 37 55 | fencing: 38 56 | golf: 39 57 | sword: 40 58 | shoot_ball: 41 59 | clap: 42 60 | punch: 43 61 | catch: 44 62 | jump: 45 63 | kick: 46 64 | ride_bike: 47 65 | wave: 48 66 | laugh: 49 67 | pullup: 50 68 | output_shape: 69 | - 224 70 | - 224 71 | batch_size: 8 72 | workers: 8 73 | augmentation_config: 74 | train_crop_type: random_crop 75 | horizontal_flip_prob: 0.5 76 | rgb_input_mean: [0.485, 0.456, 0.406] 77 | rgb_input_std: [0.229, 0.224, 0.225] 78 | val_center_crop: True 79 | -------------------------------------------------------------------------------- /tao_classification/mobilenet_v2/mobilenetv2_imagenet2012.txt: -------------------------------------------------------------------------------- 1 | model_config { 2 | # Model Architecture can be chosen from: 3 | # ['resnet', 'vgg', 'googlenet', 'alexnet'] 4 | arch: "mobilenet_v2" 5 | # for resnet --> n_layers can be [10, 18, 50] 6 | # for vgg --> n_layers can be [16, 19] 7 | use_bias: False 8 | use_imagenet_head: True 9 | use_batch_norm: True 10 | resize_interpolation_method: BICUBIC 11 | # if you want to use the pretrained model, 12 | # image size should be "3,224,224" 13 | # otherwise, it can be "3, X, Y", where X,Y >= 16 14 | input_image_size: "3,224,224" 15 | } 16 | train_config { 17 | preprocess_mode: "tf" 18 | train_dataset_path: "/raid/ImageNet2012/ImageNet2012/train" 19 | val_dataset_path: "/raid/ImageNet2012/ImageNet2012/val" 20 | # Only ['sgd', 'adam'] are supported for optimizer 21 | optimizer { 22 | sgd { 23 | lr: 0.045 24 | decay: 0.0 25 | momentum: 0.9 26 | nesterov: False 27 | } 28 | } 29 | batch_size_per_gpu: 96 30 | n_epochs: 420 31 | # Number of CPU cores for loading data 32 | n_workers: 16 33 | # regularizer 34 | reg_config { 35 | # regularizer type can be "L1", "L2" or "None". 36 | type: "L2" 37 | # if the type is not "None", 38 | # scope can be either "Conv2D" or "Dense" or both. 
39 | scope: "Conv2D,Dense" 40 | # 0 < weight decay < 1 41 | weight_decay: 5e-5 42 | } 43 | lr_config { 44 | cosine { 45 | learning_rate: 0.05 46 | min_lr_ratio: 0.001 47 | } 48 | } 49 | enable_random_crop: True 50 | enable_center_crop: True 51 | enable_color_augmentation: True 52 | mixup_alpha: 0.2 53 | label_smoothing: 0.1 54 | } 55 | eval_config { 56 | eval_dataset_path: "/raid/ImageNet2012/ImageNet2012/val" 57 | model_path: "/workspace/classification/mobilenet_v2/results/weights/mobilenet_v2_420.tlt" 58 | top_k: 1 59 | batch_size: 32 60 | n_workers: 8 61 | enable_center_crop: True 62 | } 63 | -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/README.md: -------------------------------------------------------------------------------- 1 | # PointPillar TensorRT Inference Sample 2 | TensorRT Inference Sample for PointPillars in NVIDIA TAO Toolkit 3 | 4 | # PointPillars inference with TensorRT 5 | This repository provides an end-to-end inference sample for [PointPillars](https://arxiv.org/abs/1812.05784) with TensorRT. 6 | 7 | The input model is the TensorRT engine generated by NVIDIA TAO toolkit with `tao-converter`. 8 | 9 | ## Detailed Steps 10 | 11 | * Install TensorRT 8.2(or above) 12 | 13 | * Install TensorRT OSS 22.02 14 | ``` 15 | git clone -b 22.02 https://github.com/NVIDIA/TensorRT.git TensorRT 16 | cd TensorRT 17 | git submodule update --init --recursive 18 | mkdir -p build && cd build 19 | cmake .. -DCUDA_VERSION=$CUDA_VERSION -DGPU_ARCHS=$GPU_ARCHS 20 | make nvinfer_plugin -j$(nproc) 21 | make nvinfer_plugin_static -j$(nproc) 22 | cp libnvinfer_plugin.so.8.2.* /usr/lib/$ARCH-linux-gnu/libnvinfer_plugin.so.8.2.3 23 | cp libnvinfer_plugin_static.a /usr/lib/$ARCH-linux-gnu/libnvinfer_plugin_static.a 24 | ``` 25 | 26 | * Train and export the `.etlt` model with TAO Toolkit 27 | 28 | * Generate TensorRT engine with `tao-converter` 29 | 30 | ``` 31 | tao-converter -k $KEY \ 32 | -e $USER_EXPERIMENT_DIR/trt.fp16.engine \ 33 | -p points,1x204800x4,1x204800x4,1x204800x4 \ 34 | -p num_points,1,1,1 \ 35 | -t fp16 \ 36 | $USER_EXPERIMENT_DIR/pointpillars_deployable.etlt 37 | ``` 38 | 39 | * Clone the repo 40 | 41 | ``` 42 | cd ~ 43 | git clone https://github.com/NVIDIA-AI-IOT/tao_toolkit_recipes.git 44 | cd tao_toolkit_recipes 45 | git lfs pull 46 | ``` 47 | 48 | * Run the TensorRT Inference 49 | 50 | ``` 51 | cd tao_pointpillars/tensorrt_sample/test 52 | mkdir build 53 | cd build 54 | cmake .. 
-DCUDA_VERSION= 55 | make -j8 56 | ./pointpillars -e /path/to/tensorrt/engine -l ../../data/102.bin -t 0.01 -c Vehicle,Pedestrain,Cyclist -n 4096 -p -d fp16 57 | ``` 58 | -------------------------------------------------------------------------------- /tao_action_recognition/specs/train_rgb_3d_64_i3d.yaml: -------------------------------------------------------------------------------- 1 | output_dir: ./exp0_20__/rgb_3d_hmdb 2 | encryption_key: nvidia_tlt 3 | gpu_ids: [0, 1, 2, 3, 4, 5, 6, 7] 4 | model_config: 5 | model_type: rgb 6 | input_type: "3d" 7 | backbone: i3d 8 | rgb_seq_length: 64 9 | rgb_pretrained_model_path: /workspace/action_recognition/i3d_pretrained/rgb_imagenet_kinetics.pt 10 | rgb_pretrained_num_classes: 400 11 | sample_strategy: consecutive 12 | sample_rate: 1 13 | train_config: 14 | optim: 15 | lr: 0.01 16 | momentum: 0.9 17 | weight_decay: 0.0000001 18 | lr_steps: [12, 25] 19 | lr_decay: 0.1 20 | epochs: 35 21 | dataset_config: 22 | train_dataset_dir: /raid/HMDB51_splitted_org/train 23 | val_dataset_dir: /raid/HMDB51_splitted_org/test 24 | label_map: 25 | throw: 0 26 | push: 1 27 | dribble: 2 28 | shoot_gun: 3 29 | hug: 4 30 | smile: 5 31 | fall_floor: 6 32 | chew: 7 33 | turn: 8 34 | cartwheel: 9 35 | stand: 10 36 | draw_sword: 11 37 | drink: 12 38 | eat: 13 39 | talk: 14 40 | climb: 15 41 | smoke: 16 42 | pick: 17 43 | shake_hands: 18 44 | pushup: 19 45 | swing_baseball: 20 46 | somersault: 21 47 | walk: 22 48 | flic_flac: 23 49 | run: 24 50 | ride_horse: 25 51 | sit: 26 52 | kiss: 27 53 | situp: 28 54 | hit: 29 55 | handstand: 30 56 | climb_stairs: 31 57 | pour: 32 58 | shoot_bow: 33 59 | kick_ball: 34 60 | brush_hair: 35 61 | sword_exercise: 36 62 | dive: 37 63 | fencing: 38 64 | golf: 39 65 | sword: 40 66 | shoot_ball: 41 67 | clap: 42 68 | punch: 43 69 | catch: 44 70 | jump: 45 71 | kick: 46 72 | ride_bike: 47 73 | wave: 48 74 | laugh: 49 75 | pullup: 50 76 | output_shape: 77 | - 224 78 | - 224 79 | batch_size: 8 80 | workers: 8 81 | augmentation_config: 82 | train_crop_type: random_crop 83 | horizontal_flip_prob: 0.5 84 | rgb_input_mean: [0.485, 0.456, 0.406] 85 | rgb_input_std: [0.229, 0.224, 0.225] 86 | val_center_crop: True 87 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/jetson_of/vpi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | cmake_minimum_required(VERSION 3.5) 28 | 29 | project(jetson_optflow_flow) 30 | 31 | set(CMAKE_CXX_STANDARD 11) 32 | 33 | find_package(vpi 1.1 REQUIRED) 34 | find_package(OpenCV REQUIRED) 35 | 36 | add_executable(${PROJECT_NAME} main.cpp) 37 | target_link_libraries(${PROJECT_NAME} vpi opencv_core 38 | opencv_imgproc) 39 | 40 | if(OpenCV_VERSION VERSION_LESS 3) 41 | target_link_libraries(${PROJECT_NAME} opencv_highgui) 42 | else() 43 | target_link_libraries(${PROJECT_NAME} opencv_imgcodecs opencv_videoio) 44 | endif() 45 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/convert_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
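# convert_dataset.py: decode a video with OpenCV and write every frame to the output folder as a zero-padded PNG (000001.png, 000002.png, ...).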
20 | 21 | import argparse 22 | import os 23 | import cv2 24 | 25 | 26 | def clip_video(input_video_path, output_path): 27 | cap = cv2.VideoCapture(input_video_path) 28 | frame_cnt = cap.get(cv2.CAP_PROP_FRAME_COUNT) 29 | print("f cnt: {}".format(frame_cnt)) 30 | height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) 31 | width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) 32 | 33 | img_id = 1 34 | while cap.isOpened(): 35 | ret, frame = cap.read() 36 | img_name = os.path.join(output_path, str(img_id).zfill(6)+".png") 37 | if ret: 38 | cv2.imwrite(img_name, frame) 39 | else: 40 | break 41 | img_id += 1 42 | 43 | 44 | if __name__ == "__main__": 45 | 46 | parser = argparse.ArgumentParser(description='Clip video to RGB frames') 47 | parser.add_argument('--input_video', type=str, help='input video path') 48 | parser.add_argument('--output_folder', type=str, help='output images path') 49 | args = parser.parse_args() 50 | clip_video(args.input_video, args.output_folder) 51 | -------------------------------------------------------------------------------- /tao_ocdr/handwritten/specs/ocd/train.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | 3 | model: 4 | load_pruned_graph: False 5 | pruned_graph_path: '/results/prune/pruned_0.1.pth' 6 | pretrained_model_path: '/data/ocdnet/ocdnet_deformable_resnet18.pth' 7 | backbone: deformable_resnet18 8 | 9 | train: 10 | results_dir: /results/ocd/train 11 | num_epochs: 300 12 | #resume_training_checkpoint_path: '/results/train/resume.pth' 13 | checkpoint_interval: 1 14 | validation_interval: 1 15 | trainer: 16 | clip_grad_norm: 5.0 17 | 18 | optimizer: 19 | type: Adam 20 | args: 21 | lr: 0.001 22 | 23 | lr_scheduler: 24 | type: WarmupPolyLR 25 | args: 26 | warmup_epoch: 3 27 | 28 | post_processing: 29 | type: SegDetectorRepresenter 30 | args: 31 | thresh: 0.45 32 | box_thresh: 0.55 33 | max_candidates: 1000 34 | unclip_ratio: 1.5 35 | 36 | metric: 37 | type: QuadMetric 38 | args: 39 | is_output_polygon: false 40 | 41 | 42 | dataset: 43 | train_dataset: 44 | data_path: ['/data/ocdnet/iamdata/train'] 45 | args: 46 | pre_processes: 47 | - type: IaaAugment 48 | args: 49 | - {'type':Fliplr, 'args':{'p':0.5}} 50 | - {'type': Affine, 'args':{'rotate':[-10,10]}} 51 | - {'type':Resize,'args':{'size':[0.5,3]}} 52 | - type: EastRandomCropData 53 | args: 54 | size: [1024,1024] 55 | max_tries: 50 56 | keep_ratio: true 57 | - type: MakeBorderMap 58 | args: 59 | shrink_ratio: 0.4 60 | thresh_min: 0.3 61 | thresh_max: 0.7 62 | - type: MakeShrinkMap 63 | args: 64 | shrink_ratio: 0.4 65 | min_text_size: 8 66 | 67 | img_mode: BGR 68 | filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] 69 | ignore_tags: ['*', '###'] 70 | loader: 71 | batch_size: 4 72 | pin_memory: true 73 | num_workers: 12 74 | 75 | validate_dataset: 76 | data_path: ['/data/ocdnet/iamdata/test'] 77 | args: 78 | pre_processes: 79 | - type: Resize2D 80 | args: 81 | short_size: 82 | - 2464 83 | - 3520 84 | resize_text_polys: true 85 | img_mode: BGR 86 | filter_keys: [] 87 | ignore_tags: ['*', '###'] 88 | loader: 89 | batch_size: 1 90 | pin_memory: false 91 | num_workers: 4 92 | 93 | -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | cmake_minimum_required(VERSION 2.8.7) 17 | set(PROJECT_NAME pointpillars) 18 | EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCH ) 19 | message( STATUS "Architecture: ${ARCH}" ) 20 | 21 | find_package(CUDA REQUIRED) 22 | 23 | set(CUDA_VERSION 11.3) 24 | set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-${CUDA_VERSION}) 25 | 26 | SET(CMAKE_BUILD_TYPE "Release") 27 | add_compile_options(-W) 28 | add_compile_options(-std=c++11) 29 | 30 | set(SMS 50 52 53 60 61 62 70 72 75 80 86) 31 | foreach(sm ${SMS}) 32 | set(GENCODE ${GENCODE} -gencode arch=compute_${sm},code=sm_${sm}) 33 | endforeach() 34 | list(GET SMS -1 LATEST_SM) 35 | set(GENCODE "${GENCODE} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}") 36 | 37 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} 38 | -ccbin ${CMAKE_CXX_COMPILER} 39 | -Xcompiler -DWIN_INTERFACE_CUSTOM 40 | -Xcompiler -I/usr/${ARCH}-linux-gnu/include/ 41 | -Xlinker -lsocket 42 | -Xlinker -rpath=/usr/lib/${ARCH}-linux-gnu/ 43 | -Xlinker -rpath=/usr/${ARCH}-linux-gnu/lib/ 44 | -Xlinker -L/usr/lib/${ARCH}-linux-gnu/ 45 | -Xlinker -L/usr/${ARCH}-linux-gnu/lib/ 46 | ) 47 | 48 | set(TENSORRT_INCLUDE_DIRS /usr/include/${ARCH}-linux-gnu/) 49 | set(TENSORRT_LIBRARY_DIRS /usr/lib/${ARCH}-linux-gnu/) 50 | 51 | include_directories( 52 | ${CUDA_INCLUDE_DIRS} 53 | ${TENSORRT_INCLUDE_DIRS} 54 | ../include/ 55 | ) 56 | 57 | link_directories( 58 | ${TENSORRT_LIBRARY_DIRS} 59 | /usr/lib/${ARCH}-linux-gnu 60 | /usr/${ARCH}-linux-gnu/lib/ 61 | ) 62 | 63 | file(GLOB_RECURSE SOURCE_FILES 64 | ../src/*.cu 65 | ../src/*.cpp 66 | ) 67 | 68 | cuda_add_executable(${PROJECT_NAME} main.cpp ${SOURCE_FILES}) 69 | 70 | target_link_libraries(${PROJECT_NAME} 71 | libnvinfer.so 72 | libnvonnxparser.so 73 | libnvinfer_plugin.so 74 | ) 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | 24 | VPI dense optical flow sample 25 | 26 | Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 27 | 28 | Redistribution and use in source and binary forms, with or without 29 | modification, are permitted provided that the following conditions 30 | are met: 31 | * Redistributions of source code must retain the above copyright 32 | notice, this list of conditions and the following disclaimer. 33 | * Redistributions in binary form must reproduce the above copyright 34 | notice, this list of conditions and the following disclaimer in the 35 | documentation and/or other materials provided with the distribution. 36 | * Neither the name of NVIDIA CORPORATION nor the names of its 37 | contributors may be used to endorse or promote products derived 38 | from this software without specific prior written permission. 39 | 40 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 41 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 43 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 44 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 45 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 46 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 47 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 48 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 49 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 50 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 51 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/README.md: -------------------------------------------------------------------------------- 1 | # Data generation sample for TAO ActionRecognitionNet 2 | 3 | ## Introduction 4 | This projects contains the sample scripts to generate dataset to proper format used by TAO ActionRecognitionNet 5 | 6 | - `convert_dataset.py` : Convert the video to RGB frames. 7 | - `convert_of.py` : Convert the optical flow vectors to grayscale images. 8 | - `split_dataset.py` : Script to split the HMDB51 dataset. 9 | - `load_tracks.py` / `save_tracks_shad.py` : Scripts to process SHAD dataset's annotation 10 | 11 | 12 | ## Prequisites 13 | - xmltodict 14 | - cv2 15 | 16 | ``` 17 | pip install xmltodict opencv-python 18 | ``` 19 | 20 | And we use the sample application `AppOFCuda` in Nvidia optical flow [SDK](https://developer.nvidia.com/opticalflow-sdk) to generate optical flow of frames. You could get this app by compiling by yourself or download the compiled binary in on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tao/resources/cv_samples/version) (It is packaged with action recognition notebook). 
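After preprocessing, each video ends up as a folder of decoded frames (and, for the RGB+OF pipelines, grayscale optical-flow components). The tree below is only an illustration of the resulting layout — class and video names are placeholders, and the `u`/`v` folders exist only for the optical-flow pipelines:

```
train/
├── <class_name>/
│   └── <video_name>/
│       ├── rgb/   # decoded RGB frames, e.g. 000001.png, 000002.png, ...
│       ├── u/     # optical-flow x component as grayscale images (OF pipelines only)
│       └── v/     # optical-flow y component as grayscale images (OF pipelines only)
```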
21 | 22 | ## Steps to generate dataset for TAO ActionRecognitionNet 23 | We provide 3 all_in_one scripts: 24 | 25 | - `preprocess_HMDB_RGB.sh`: Generate RGB dataset of HMDB51 26 | - `preprocess_SHAD_RGB.sh`: Generate RGB dataset of SHAD 27 | - `preprocess_SHAD.sh`: Generate RGB+OF dataset of SHAD 28 | 29 | ### SHAD dataset 30 | 31 | Dataset [URL](https://best.sjtu.edu.cn/Data/View/990) 32 | 33 | ```sh 34 | # make directory to contain 35 | mkdir -p train_raw 36 | 37 | # Download the dataset you need and unrar: 38 | wget -P ./ https://best.sjtu.edu.cn/Assets/userfiles/sys_eb538c1c-65ff-4e82-8e6a-a1ef01127fed/files/ZIP/Bend-train.rar 39 | unrar x Bend-train.rar train_raw 40 | ... 41 | 42 | # Generate RGB dataset with all_in_one script: 43 | ./preprocess_SHAD_RGB.sh train_raw train 44 | # Or you can generate RGB+OF dataset: 45 | # ./preprocess_SHAD.sh train_raw train 46 | 47 | ``` 48 | 49 | ### HMDB51 dataset 50 | 51 | Dataset [URL](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) 52 | 53 | ```sh 54 | # download the dataset and unrar: 55 | wget http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar 56 | unrar x hmdb51_org.rar video_rar 57 | 58 | # unrar the videos packages: 59 | unrar x ./video_rar/climb.rar ./HMDB51_videos/ 60 | unrar x ./video_rar/run.rar ./HMDB51_videos/ 61 | ... 62 | 63 | # run all_in_one script: 64 | ./preprocess_HMDB_RGB.sh ./HMDB51_videos ./HMDB51 65 | 66 | # split the dataset if needed: 67 | # python split_dataset.py 68 | 69 | ``` 70 | 71 | ### Common data process pipeline: 72 | The data process pipeline in above scripts can be concluded in following diagrams: 73 | - For RGB-only model: 74 | ![rgb_only_pipe](resources/rgb_preprocess_pipe.png) 75 | - For OF-only model: 76 | ![of_only_pipe](resources/of_preprocess_pipe.png) -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/split_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
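# split_dataset.py: move each HMDB51 video's frame folder into the train/ or test/ directory according to the official *_test_split1.txt files (label 1 = train, label 2 = test).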
20 | 21 | import os 22 | import shutil 23 | import sys 24 | 25 | root_path = sys.argv[1] 26 | split_files_path = sys.argv[2] 27 | target_train_path = sys.argv[3] 28 | target_test_path = sys.argv[4] 29 | 30 | if not os.path.exists(target_train_path): 31 | os.makedirs(target_train_path) 32 | if not os.path.exists(target_test_path): 33 | os.makedirs(target_test_path) 34 | 35 | train_cnt = 0 36 | test_cnt = 0 37 | for class_name in os.listdir(root_path): 38 | split_files = os.path.join(split_files_path, class_name + "_test_split1.txt") 39 | cls_train_path = os.path.join(target_train_path, class_name) 40 | cls_test_path = os.path.join(target_test_path, class_name) 41 | if not os.path.exists(cls_train_path): 42 | os.makedirs(cls_train_path) 43 | if not os.path.exists(cls_test_path): 44 | os.makedirs(cls_test_path) 45 | 46 | with open(split_files, "r") as f: 47 | split_list = f.readlines() 48 | 49 | for line in split_list: 50 | video_name, label = line.split() 51 | video_name = video_name.split(".")[0] 52 | cur_path = os.path.join(root_path, class_name, video_name) 53 | if int(label) == 1: 54 | train_cnt += 1 55 | des_path = os.path.join(target_train_path, class_name, video_name) 56 | shutil.move(cur_path, des_path) 57 | elif int(label) == 2: 58 | test_cnt += 1 59 | des_path = os.path.join(target_test_path, class_name, video_name) 60 | shutil.move(cur_path, des_path) 61 | 62 | 63 | print("Split 1: \n Train: {}\n Test: {}".format(train_cnt, test_cnt)) 64 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/preprocess_HMDB_RGB.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | MKDIR(){ 22 | if [ ! 
-d $1 ]; then 23 | mkdir -p $1 24 | fi 25 | } 26 | 27 | WORKER_CNT=4 28 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 29 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 30 | 31 | 32 | RUN_WORKERS(){ 33 | for((i=0;i<$WORKER_CNT;i++)); do 34 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 35 | python3 ./convert_dataset.py --input_video ${VIDEO_LIST[i]} --output_folder ${RGB_PATH_LIST[i]} 36 | fi 37 | done 38 | wait 39 | } 40 | 41 | if [ $# -ne 2 ]; then 42 | echo "USAGE:./preprocess_HMDB_RGB.sh [hmdb_dir] [output_top_dir]" 43 | exit 1 44 | else 45 | HMDB_TOP_DIR=$1 46 | OUTPUT_TOP_DIR=$2 47 | echo $HMDB_TOP_DIR 48 | echo $OUTPUT_TOP_DIR 49 | #TEMP_DIR="./tmp" 50 | #MKDIR $TEMP_DIR 51 | MKDIR $OUTPUT_TOP_DIR 52 | fi 53 | 54 | # 1st stage: unrar rar package: 55 | # for class in $HMDB_TOP_DIR/*; do 56 | # unrar x $class $TEMP_DIR > /dev/null & 57 | # done 58 | 59 | # 2nd stage: Clip video and generate optical flow out of it 60 | for class in $HMDB_TOP_DIR/*; do 61 | CLASS_NAME=$(echo $(basename $class) | cut -d . -f1) 62 | echo "Preprocess $CLASS_NAME" 63 | cnt=0 64 | # extract the frames 65 | for video in $HMDB_TOP_DIR/$CLASS_NAME/*; do 66 | VIDEO_NAME=$(echo $(basename $video) | cut -d . -f1) 67 | RGB_PATH=$OUTPUT_TOP_DIR/$CLASS_NAME/$VIDEO_NAME/"rgb" 68 | MKDIR $RGB_PATH 69 | VIDEO_LIST[$cnt]=$video 70 | RGB_PATH_LIST[$cnt]=$RGB_PATH 71 | 72 | cnt=$((cnt + 1)) 73 | if [ $cnt -eq $WORKER_CNT ]; then 74 | cnt=0 75 | RUN_WORKERS 76 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 77 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 78 | fi 79 | done 80 | if [ $cnt -ne 0 ]; then 81 | RUN_WORKERS 82 | fi 83 | done 84 | 85 | # rm -r $TEMP_DIR 86 | -------------------------------------------------------------------------------- /tao_action_recognition/doc/load_I3D.md: -------------------------------------------------------------------------------- 1 | # Load I3D Kinetics pretrained weights in TAO and finetune on HMDB51 2 | 3 | I3D is a 3D inception architecture proposed in paper *Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset*. In this paper, the authors show us the enormous benefit of pretrained weights on Kinetics400 of I3D architecture for the downstream dataset --- We can get much higher accuracy on other action recognition datasets with Kinetics pretrained weights: 4 | 5 | |Model type|Dataset|Pretrained|Acc| 6 | |:---:|:---:|:---:|:---:| 7 | |I3D RGB-Only|HMDB51|ImageNet|49.8%| 8 | |I3D OF-Only|HMDB51|ImageNet|61.9%| 9 | |I3D RGB-Only|HMDB51|Kinetics|74.3%| 10 | |I3D OF-Only|HMDB51|Kinetics|77.3%| 11 | 12 | In TAO Toolkit, we support to use I3D architecture for action recognition and it could alos load the pytorch version of Kinect400 pretrained I3D model to help improve the accuracy of the downstream dataset. 13 | 14 | ## Load I3D Kinetics pretrained weights and finetune on HMDB51 15 | 16 | The I3D architecture in TAO Toolkit is following the public [repo](https://github.com/piergiaj/pytorch-i3d). And this repo also contains the [RGB](https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_imagenet.pt) and the [Optical flow](https://github.com/piergiaj/pytorch-i3d/blob/master/models/flow_imagenet.pt) pretrained weights converted from DeepMind. 17 | 18 | To load these models, some config options should be set. Take RGB models as an example, the following are the `model_config` in the training config yaml file to load pretrained I3D RGB pretrained weights. 
19 | 20 | ```yaml 21 | model_config: 22 | model_type: rgb 23 | input_type: 3d 24 | backbone: i3d 25 | rgb_seq_length: 64 26 | rgb_pretrained_model_path: /workspace/action_recognition/i3d_pretrained/rgb_imagenet_kinetics.pt 27 | rgb_pretrained_num_classes: 400 28 | ``` 29 | 30 | In the above config, the `backbone` is set to `i3d`, `rgb_pretrained_model_path` is set to the path of pretrained pytorch weights and the `rgb_pretrained_num_classes` is set to 400 to match with Kinetics-400 classes. 31 | 32 | We provide the [spec](https://github.com/NVIDIA-AI-IOT/tao_toolkit_recipes/blob/main/tao_action_recognition/specs/train_rgb_3d_64_i3d.yaml) to finetune I3D model on HMDB51 dataset. You might get ~75% accuracy after the training with following command. 33 | 34 | ```shell 35 | tao action_recognition train -e /path/to/train_rgb_3d_64_i3d.yaml -k your_key -r /path/to/results 36 | ``` 37 | 38 | ## Export the I3D model 39 | The exported I3D model could be consumed by TensorRT 8.2.3 and above. We provide the [spec](https://github.com/NVIDIA-AI-IOT/tao_toolkit_recipes/blob/main/tao_action_recognition/specs/i3d_rgb_3d_64_export.yaml) to export TAO Toolkit trained I3D model. And you could use the following command to export the model to etlt format: 40 | 41 | ```shell 42 | tao action_recognition export -k your_key -e /path/to/i3d_rgb_3d_64_export.yaml 43 | ``` 44 | 45 | ## Reference 46 | - [I3D models trained on Kinetics - pytorch version](https://github.com/piergiaj/pytorch-i3d) 47 | - [I3D models trained on Kinetics](https://github.com/piergiaj/pytorch-i3d) -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/include/pointpillar.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #ifndef POINTPILLAR_H_ 19 | #define POINTPILLAR_H_ 20 | 21 | #include 22 | #include "cuda_runtime.h" 23 | #include "NvInfer.h" 24 | #include "NvOnnxConfig.h" 25 | #include "NvOnnxParser.h" 26 | #include "NvInferRuntime.h" 27 | #include "postprocess.h" 28 | 29 | #define PERFORMANCE_LOG 1 30 | 31 | // Logger for TensorRT 32 | class Logger : public nvinfer1::ILogger { 33 | public: 34 | void log(Severity severity, const char* msg) noexcept override { 35 | // suppress info-level message 36 | //if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR || severity == Severity::kINFO ) { 37 | if (severity == Severity::kERROR || severity == Severity::kINTERNAL_ERROR) { 38 | std::cerr << "trt_infer: " << msg << std::endl; 39 | } 40 | } 41 | }; 42 | 43 | class TRT { 44 | private: 45 | cudaEvent_t start, stop; 46 | 47 | float elapsedTime = 0.0f; 48 | Logger gLogger_; 49 | nvinfer1::IExecutionContext *context = nullptr; 50 | nvinfer1::ICudaEngine *engine = nullptr; 51 | 52 | cudaStream_t stream_; 53 | public: 54 | TRT( 55 | std::string modelFile, 56 | std::string engineFile, 57 | cudaStream_t stream, 58 | const std::string& data_type 59 | ); 60 | ~TRT(void); 61 | 62 | int doinfer(void**buffers, bool do_profile); 63 | nvinfer1::Dims get_binding_shape(int index); 64 | int getPointSize(); 65 | }; 66 | 67 | class PointPillar { 68 | private: 69 | cudaEvent_t start, stop; 70 | float elapsedTime = 0.0f; 71 | cudaStream_t stream_; 72 | //output of TRT 73 | std::shared_ptr trt_; 74 | //output of TRT 75 | float *box_output = nullptr; 76 | int *box_num = nullptr; 77 | unsigned int box_size; 78 | std::vector res; 79 | 80 | public: 81 | PointPillar( 82 | std::string modelFile, 83 | std::string engineFile, 84 | cudaStream_t stream, 85 | const std::string& data_type 86 | ); 87 | ~PointPillar(void); 88 | int getPointSize(); 89 | int doinfer( 90 | void*points_data, 91 | unsigned int* points_size, 92 | std::vector &nms_pred, 93 | float nms_iou_thresh, 94 | int pre_nms_top_n, 95 | std::vector& class_names, 96 | bool do_profile 97 | ); 98 | }; 99 | 100 | #endif -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/generate_new_dataset_format.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
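# generate_new_dataset_format.py: merge the per-clip rgb/u/v folders of each video into a single rgb/u/v folder per video, renaming frames to a continuous zero-padded index (000000, 000001, ...).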
20 | import os 21 | import sys 22 | 23 | root_dir = sys.argv[1] 24 | target_dir = sys.argv[2] 25 | 26 | for class_name in os.listdir(root_dir): 27 | root_class_path = os.path.join(root_dir, class_name) 28 | target_class_path = os.path.join(target_dir, class_name) 29 | if not os.path.exists(target_class_path): 30 | os.makedirs(target_class_path) 31 | for video_name in os.listdir(root_class_path): 32 | video_path = os.path.join(root_class_path, video_name) 33 | target_video_path = os.path.join(target_class_path, video_name) 34 | target_rgb_path = os.path.join(target_video_path, "rgb") 35 | target_u_path = os.path.join(target_video_path, "u") 36 | target_v_path = os.path.join(target_video_path, "v") 37 | 38 | if not os.path.exists(target_rgb_path): 39 | os.makedirs(target_rgb_path) 40 | if not os.path.exists(target_u_path): 41 | os.makedirs(target_u_path) 42 | if not os.path.exists(target_v_path): 43 | os.makedirs(target_v_path) 44 | 45 | img_idx = 0 46 | for video_clip_name in sorted(os.listdir(video_path)): 47 | video_clip_path = os.path.join(video_path, video_clip_name) 48 | rgb_path = os.path.join(video_clip_path, "rgb") 49 | u_path = os.path.join(video_clip_path, "u") 50 | v_path = os.path.join(video_clip_path, "v") 51 | 52 | assert len(os.listdir(u_path)) == \ 53 | len(os.listdir(v_path)), "video clip mismatch. {}".format(video_clip_path) 54 | 55 | for file_name in sorted(os.listdir(rgb_path)): 56 | ext = file_name.split(".")[-1] 57 | rgb_file = os.path.join(rgb_path, file_name) 58 | u_file = os.path.join(u_path, file_name) 59 | v_file = os.path.join(v_path, file_name) 60 | 61 | target_file_name = str(img_idx).zfill(6) + "." + ext 62 | img_idx += 1 63 | target_rgb_file = os.path.join(target_rgb_path, target_file_name) 64 | target_u_file = os.path.join(target_u_path, target_file_name) 65 | target_v_file = os.path.join(target_v_path, target_file_name) 66 | 67 | os.rename(rgb_file, target_rgb_file) 68 | if os.path.exists(u_file): 69 | os.rename(u_file, target_u_file) 70 | if os.path.exists(v_file): 71 | os.rename(v_file, target_v_file) 72 | 73 | -------------------------------------------------------------------------------- /tao_retinanet/tao_retinanet_scales_aspect_ratio_estimate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
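# Reads the KITTI label files, collects the width/height of every non-DontCare box (dropping aspect-ratio outliers), and clusters them with KMeans to suggest scales and aspect-ratio values for the retinanet spec.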
20 | 21 | import os 22 | import numpy as np 23 | from sklearn.cluster import KMeans 24 | 25 | num_scales_retinanet=6 26 | num_ars_retinanet=3 27 | limit_max_ar=4 28 | 29 | #file 004156.jpg 30 | # 004156.jpg: JPEG image data, JFIF standard 1.01, aspect ratio, density 1x1, segment length 16, baseline, precision 8, 1242x375, frames 3 31 | shorter_length_of_image = 375 32 | 33 | folder="/home/luwu/tlt-experiments/data/training/label_2/" 34 | widths=[] 35 | heights=[] 36 | files=[] 37 | 38 | for r, d, f in os.walk(folder): 39 | for file in f: 40 | if file.endswith(".txt"): 41 | file1 = open(folder+file, 'r') 42 | lines=file1.readlines() 43 | 44 | for line in lines: 45 | line_split=line.split(" ") 46 | cls=line_split[0] 47 | xl=float(line_split[4]) 48 | yl=float(line_split[5]) 49 | xr=float(line_split[6]) 50 | yr=float(line_split[7]) 51 | 52 | width=xr-xl 53 | height=yr-yl 54 | 55 | if cls != 'DontCare' and width>=0 and height >= 0: 56 | widths.append(width) 57 | heights.append(height) 58 | files.append(file1) 59 | file1.close() 60 | 61 | scales=[] 62 | aspect_ratios=[] 63 | for i in range(len(widths)): 64 | w=widths[i] 65 | h=heights[i] 66 | if w new_azure:version_1 82 | 83 | $ docker save -o new_azure_version_1.tar.gz new_azure:version_1 84 | 85 | $ docker save -o tao-toolkit-tf-v3.21.11-tf1.15.5-py3.tar.gz nvcr.io/nvidia/tao/tao-toolkit-tf:v3.21.11-tf1.15.5-py3 86 | 87 | $ docker save -o tao-toolkit-tf-v3.21.11-tf1.15.4-py3.tar.gz nvcr.io/nvidia/tao/tao-toolkit-tf:v3.21.11-tf1.15.4-py3 88 | 89 | $ docker save -o tao-toolkit-pyt-v3.21.11-py3.tar.gz nvcr.io/nvidia/tao/tao-toolkit-pyt:v3.21.11-py3 90 | 91 | $ docker save -o tao-toolkit-lm-v3.21.08-py3.tar.gz nvcr.io/nvidia/tao/tao-toolkit-lm:v3.21.08-py3 92 | ``` 93 | 94 | 95 | 96 | 97 | Copy all the tar.gz files into the 2nd machine which has no internet. 98 | ``` 99 | $ docker load -i new_azure_version_1.tar.gz 100 | 101 | $ docker load -i tao-toolkit-tf-v3.21.11-tf1.15.4-py3.tar.gz 102 | 103 | $ docker load -i tao-toolkit-tf-v3.21.11-tf1.15.5-py3.tar.gz 104 | 105 | $ docker load -i tao-toolkit-pyt-v3.21.11-py3.tar.gz 106 | 107 | $ docker load -i tao-toolkit-lm-v3.21.08-py3.tar.gz 108 | ``` 109 | 110 | Copy the training dataset into below path of the 2nd machine. If the path is not available, please generate the same as the 1st machine. 111 | `/home/username/` 112 | 113 | 114 | 115 | In the 2nd machine, login the new azure docker 116 | ``` 117 | $ docker run -it --rm -v /var/run/docker.sock:/var/run/docker.sock new_azure:version_1 /bin/bash 118 | ``` 119 | 120 | Then run training. 121 | -------------------------------------------------------------------------------- /tao_ocdr/handwritten/preprocess_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import os 4 | import cv2 5 | import argparse 6 | from tqdm import tqdm 7 | 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser("preprocess_data", add_help=True, description="Preprocess IAMDATA to TAO Toolkit OCRNet format") 11 | parser.add_argument( 12 | "--images_dir", 13 | help="Path to original images", 14 | default=None, 15 | required=True, 16 | ) 17 | parser.add_argument( 18 | "--labels_dir", 19 | help="Path to original label txt files", 20 | default=None, 21 | required=True, 22 | ) 23 | parser.add_argument( 24 | "--output_images_dir", 25 | help="Path to pre-processed images", 26 | default=None, 27 | required=True, 28 | ) 29 | parser.add_argument( 30 | "--gt_file_path", 31 | help="Path to ground truth list", 32 | default=None, 33 | required=True, 34 | ) 35 | parser.add_argument( 36 | "--character_list_path", 37 | help="Path to character list", 38 | default=None, 39 | required=True, 40 | ) 41 | 42 | args, _ = parser.parse_known_args() 43 | root_dir = args.images_dir 44 | gt_file_dir = args.labels_dir 45 | target_dir = args.output_images_dir 46 | 47 | if not os.path.exists(target_dir): 48 | os.makedirs(target_dir) 49 | 50 | p_gt_file = open(args.gt_file_path, "w") 51 | 52 | gt_file_list = os.listdir(gt_file_dir) 53 | character_set = set() 54 | 55 | for gt_file_name in tqdm(gt_file_list): 56 | img_id = gt_file_name.split(".")[0].replace("gt_", "") 57 | f = open(os.path.join(gt_file_dir, gt_file_name), "r") 58 | reader = f.readlines() 59 | 60 | img_path = os.path.join(root_dir, img_id+".png") 61 | img = cv2.imread(img_path) 62 | height, width, _ = img.shape 63 | for idx, ann in enumerate(reader): 64 | ann = ann.split(",") 65 | vs = ann[:8] 66 | text = ann[8:] 67 | if len(text) == 1: 68 | text = text[0].strip() 69 | if text.count("\"") == 4: 70 | text = "\"" 71 | elif len(text) == 2: 72 | text = "," 73 | else: 74 | # for label like: "163,000,000" 75 | # ignore the " " at the begin and end 76 | text = ",".join(text) 77 | text = text.replace("\"", "") 78 | text = text.strip() 79 | 80 | # Skip the words which length > 25 or non-word-level label 81 | if len(text) > 25 or (" " in text): 82 | continue 83 | # Lower-case: 84 | text = text.lower() 85 | 86 | for c in text: 87 | character_set.add(c) 88 | 89 | xs = [int(vs[idx]) for idx in range(0, len(vs), 2)] 90 | ys = [int(vs[idx]) for idx in range(1, len(vs), 2)] 91 | xmin = max(0, min(xs)) 92 | ymin = max(0, min(ys)) 93 | xmax = min(width, max(xs)) 94 | ymax = min(height, max(ys)) 95 | 96 | try: 97 | crop_img = img[ymin:ymax, xmin:xmax, :] 98 | target_img_path = f"{img_id}_{idx}.jpg" 99 | p_gt_file.write(target_img_path + "\t" + text + "\n") 100 | cv2.imwrite(os.path.join(target_dir, target_img_path), crop_img) 101 | except Exception as err: 102 | print(err) 103 | print(f"img_id: {img_id} bbox: {vs} img_shape: {img.shape}") 104 | exit() 105 | 106 | p_gt_file.close() 107 | with open(args.character_list_path, "w") as f: 108 | character_set = sorted(list(character_set)) 109 | for c in character_set: 110 | f.write(f"{c}\n") -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/preprocess_HMDB.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 
2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | MKDIR(){ 22 | if [ ! -d $1 ]; then 23 | mkdir -p $1 24 | fi 25 | } 26 | 27 | WORKER_CNT=4 28 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 29 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 30 | OF_IMG_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 31 | OF_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 32 | 33 | 34 | RUN_WORKERS(){ 35 | for((i=0;i<$WORKER_CNT;i++)); do 36 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 37 | python ./convert_dataset.py --input_video ${VIDEO_LIST[i]} --output_folder ${RGB_PATH_LIST[i]} \ 38 | && 39 | ./AppOFCuda --input=${RGB_PATH_LIST[i]}/"*.png" --output=${OF_PATH_LIST[i]}/"flow" --preset=fast --gridSize=1 \ 40 | && 41 | python ./convert_of.py --input_flow_folder ${OF_PATH_LIST[i]} --output_folder ${OF_IMG_PATH_LIST[i]} & 42 | fi 43 | done 44 | wait 45 | for((i=0;i<$WORKER_CNT;i++)); do 46 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 47 | rm -r ${OF_PATH_LIST[i]} 48 | fi 49 | done 50 | } 51 | 52 | if [ $# -ne 2 ]; then 53 | echo "USAGE:./preprocess_HMDB.sh [hmdb_dir] [output_top_dir]" 54 | exit 1 55 | else 56 | HMDB_TOP_DIR=$1 57 | OUTPUT_TOP_DIR=$2 58 | echo $HMDB_TOP_DIR 59 | echo $OUTPUT_TOP_DIR 60 | TEMP_DIR="./tmp" 61 | MKDIR $TEMP_DIR 62 | MKDIR $OUTPUT_TOP_DIR 63 | fi 64 | 65 | # 1st stage: unrar rar package: 66 | # for class in $HMDB_TOP_DIR/*; do 67 | # unrar x $class $TEMP_DIR > /dev/null & 68 | # done 69 | 70 | # 2nd stage: Clip video and generate optical flow out of it 71 | for class in $HMDB_TOP_DIR/*; do 72 | CLASS_NAME=$(echo $(basename $class) | cut -d . -f1) 73 | echo "Preprocess $CLASS_NAME" 74 | cnt=0 75 | # extract the frames 76 | for video in $HMDB_TOP_DIR/$CLASS_NAME/*; do 77 | VIDEO_NAME=$(echo $(basename $video) | cut -d . 
-f1) 78 | RGB_PATH=$OUTPUT_TOP_DIR/$CLASS_NAME/$VIDEO_NAME/"rgb" 79 | OF_PATH=$OUTPUT_TOP_DIR/$CLASS_NAME/$VIDEO_NAME/"of" 80 | OF_IMG_PATH=$OUTPUT_TOP_DIR/$CLASS_NAME/$VIDEO_NAME/ 81 | MKDIR $RGB_PATH 82 | MKDIR $OF_PATH 83 | VIDEO_LIST[$cnt]=$video 84 | RGB_PATH_LIST[$cnt]=$RGB_PATH 85 | OF_PATH_LIST[$cnt]=$OF_PATH 86 | OF_IMG_PATH_LIST[$cnt]=$OF_IMG_PATH 87 | 88 | cnt=$((cnt + 1)) 89 | if [ $cnt -eq $WORKER_CNT ]; then 90 | cnt=0 91 | RUN_WORKERS 92 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 93 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 94 | OF_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 95 | OF_IMG_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 96 | fi 97 | done 98 | if [ $cnt -ne 0 ]; then 99 | RUN_WORKERS 100 | fi 101 | done 102 | 103 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/preprocess_SHAD_RGB.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | MKDIR(){ 22 | if [ ! 
-d $1 ]; then 23 | mkdir -p $1 24 | fi 25 | } 26 | 27 | WORKER_CNT=4 28 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 29 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 30 | #OF_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 31 | #OF_U_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 32 | #OF_V_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 33 | ANNO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 34 | TEMP_VIDEO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 35 | 36 | 37 | RUN_WORKERS(){ 38 | for((i=0;i<$WORKER_CNT;i++)); do 39 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 40 | python3 ./convert_dataset.py --input_video ${VIDEO_LIST[i]} --output_folder ${RGB_PATH_LIST[i]} \ 41 | && python3 ./save_tracks_shad.py --anno_folder ${ANNO_PATH_LIST[i]} --image_folder ${RGB_PATH_LIST[i]} \ 42 | --of_folder ./ --output_folder $TEMP_DIR_ & 43 | fi 44 | done 45 | wait 46 | for((i=0;i<$WORKER_CNT;i++)); do 47 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 48 | rm -r ${TEMP_VIDEO_PATH_LIST[i]} 49 | fi 50 | done 51 | } 52 | 53 | if [ $# -ne 2 ]; then 54 | echo "USAGE:./preprocess_SHAD_RGB.sh [shad_dataset_top_dir] [output_top_dir]" 55 | exit 1 56 | else 57 | SHAD_TOP_DIR=$1 58 | OUTPUT_TOP_DIR=$2 59 | echo $SHAD_TOP_DIR 60 | echo $OUTPUT_TOP_DIR 61 | TEMP_DIR="./tmp" 62 | TEMP_DIR_="./tmp_" 63 | MKDIR $TEMP_DIR 64 | MKDIR $TEMP_DIR_ 65 | MKDIR $OUTPUT_TOP_DIR 66 | fi 67 | 68 | # 1st stage: Clip video and generate optical flow out of it 69 | for class in $SHAD_TOP_DIR/*; do 70 | if [ ! -d $class/"video"/ ]; then 71 | echo "Please use original SHAD dataset" 72 | exit 1 73 | fi 74 | echo "Preprocess $class" 75 | CLASS_NAME=$(basename $class) 76 | cnt=0 77 | for video in $class/"video"/*; do 78 | VIDEO_NAME=$(echo $(basename $video) | cut -d . -f1) 79 | ANNO_PATH=$class/"Annotations"/$VIDEO_NAME 80 | RGB_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME/"rgb" 81 | TEMP_VIDEO_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME 82 | MKDIR $RGB_PATH 83 | VIDEO_LIST[$cnt]=$video 84 | ANNO_PATH_LIST[$cnt]=$ANNO_PATH 85 | RGB_PATH_LIST[$cnt]=$RGB_PATH 86 | TEMP_VIDEO_PATH_LIST[$cnt]=$TEMP_VIDEO_PATH 87 | 88 | cnt=$((cnt + 1)) 89 | if [ $cnt -eq $WORKER_CNT ]; then 90 | cnt=0 91 | RUN_WORKERS 92 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 93 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 94 | ANNO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 95 | TEMP_VIDEO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 96 | fi 97 | done 98 | if [ $cnt -ne 0 ]; then 99 | RUN_WORKERS 100 | fi 101 | done 102 | 103 | rm -r $TEMP_DIR 104 | 105 | python generate_new_dataset_format.py $TEMP_DIR_ $OUTPUT_TOP_DIR 106 | 107 | rm -r $TEMP_DIR_ 108 | -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/convert_of.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 
12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | import argparse 22 | import cv2 23 | import numpy as np 24 | import os 25 | 26 | 27 | def parse_flow(flow_file): 28 | """Parse the optical flow vector generated from NVOF SDK.""" 29 | 30 | with open(flow_file, "rb") as f: 31 | _ = f.read(4) 32 | width = int.from_bytes(f.read(4), byteorder="little", signed=False) 33 | height = int.from_bytes(f.read(4), byteorder="little", signed=False) 34 | data = f.read() 35 | of_flatten = np.frombuffer(data, dtype=np.float32) 36 | of_array = np.reshape(of_flatten, (height, width, 2)) 37 | of_array = of_array.transpose((2, 0, 1)) 38 | flow_x = np.squeeze(of_array[0, :, :]) 39 | flow_y = np.squeeze(of_array[1, :, :]) 40 | 41 | return flow_x, flow_y 42 | 43 | 44 | def minmax_grayscale(flow_x, flow_y): 45 | """Map the flow to grayscale images. The map method follows I3D""" 46 | higher_end = 20.0 47 | lower_end = -20.0 48 | flow_x = np.maximum(np.minimum(255.0, 255.0 * ((flow_x - lower_end) / (higher_end - lower_end))), 0.0) 49 | flow_y = np.maximum(np.minimum(255.0, 255.0 * ((flow_y - lower_end) / (higher_end - lower_end))), 0.0) 50 | 51 | img_x = np.array(np.around(flow_x), dtype=np.uint8) 52 | img_y = np.array(np.around(flow_y), dtype=np.uint8) 53 | 54 | return img_x, img_y 55 | 56 | 57 | def max_rad_grayscale(flow_x, flow_y): 58 | """Map the flow to grayscale images. 
Normalize vector using max_rad""" 59 | max_rad = 1.0 60 | rad = np.sqrt(flow_x * flow_x + flow_y * flow_y) 61 | max_rad = max(max_rad, rad.max()) 62 | 63 | img_x = np.array((flow_x / max_rad) * 127.999 + 128, dtype=np.uint8) 64 | img_y = np.array((flow_y / max_rad) * 127.999 + 128, dtype=np.uint8) 65 | 66 | return img_x, img_y 67 | 68 | 69 | def convert(input_flow_folder, output_folder): 70 | """Convert the flow in input_flow floder to grayscale images""" 71 | 72 | u_img_root = os.path.join(output_folder, "u") 73 | v_img_root = os.path.join(output_folder, "v") 74 | if not os.path.exists(u_img_root): 75 | os.makedirs(u_img_root) 76 | if not os.path.exists(v_img_root): 77 | os.makedirs(v_img_root) 78 | 79 | for flow_name in os.listdir(input_flow_folder): 80 | frame_id = str(int(flow_name.split("_")[1]) + 1).zfill(6) 81 | flow_file_path = os.path.join(input_flow_folder, flow_name) 82 | flow_x, flow_y = parse_flow(flow_file_path) 83 | img_x, img_y = max_rad_grayscale(flow_x, flow_y) 84 | 85 | img_x_path = os.path.join(u_img_root, frame_id+".jpg") 86 | img_y_path = os.path.join(v_img_root, frame_id+".jpg") 87 | 88 | cv2.imwrite(img_x_path, img_x) 89 | cv2.imwrite(img_y_path, img_y) 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser(description='Convert raw optical flow vectors to grayscale images') 94 | parser.add_argument('--input_flow_folder', type=str, help='input optical flow path', required=True) 95 | parser.add_argument('--output_folder', type=str, help='output images path', required=True) 96 | args = parser.parse_args() 97 | 98 | convert(args.input_flow_folder, args.output_folder) -------------------------------------------------------------------------------- /tao_action_recognition/data_generation/preprocess_SHAD.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 20 | 21 | MKDIR(){ 22 | if [ ! 
-d $1 ]; then 23 | mkdir -p $1 24 | fi 25 | } 26 | 27 | WORKER_CNT=4 28 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 29 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 30 | OF_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 31 | OF_IMG_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 32 | ANNO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 33 | TEMP_VIDEO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 34 | 35 | 36 | RUN_WORKERS(){ 37 | for((i=0;i<$WORKER_CNT;i++)); do 38 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 39 | python3 ./convert_dataset.py --input_video ${VIDEO_LIST[i]} --output_folder ${RGB_PATH_LIST[i]} \ 40 | && ./AppOFCuda --input=${RGB_PATH_LIST[i]}/"*.png" --output=${OF_PATH_LIST[i]}/"flow" --preset=slow --gridSize=1 \ 41 | && python3 ./convert_of.py --input_flow_folder=${OF_PATH_LIST[i]} --output_folder=${OF_IMG_PATH_LIST[i]} \ 42 | && python3 ./save_tracks_shad.py --anno_folder ${ANNO_PATH_LIST[i]} --image_folder ${RGB_PATH_LIST[i]} \ 43 | --of_folder ${OF_IMG_PATH_LIST[i]} --output_folder $TEMP_DIR_ & 44 | fi 45 | done 46 | wait 47 | for((i=0;i<$WORKER_CNT;i++)); do 48 | if [ ${VIDEO_LIST[i]} != "NULL" ]; then 49 | rm -r ${TEMP_VIDEO_PATH_LIST[i]} 50 | fi 51 | done 52 | } 53 | 54 | if [ $# -ne 2 ]; then 55 | echo "USAGE:./preprocess_SHAD.sh [shad_dataset_top_dir] [output_top_dir]" 56 | exit 1 57 | else 58 | SHAD_TOP_DIR=$1 59 | OUTPUT_TOP_DIR=$2 60 | echo $SHAD_TOP_DIR 61 | echo $OUTPUT_TOP_DIR 62 | TEMP_DIR="./tmp" 63 | TEMP_DIR_="./tmp_" 64 | MKDIR $TEMP_DIR 65 | MKDIR $TEMP_DIR_ 66 | MKDIR $OUTPUT_TOP_DIR 67 | fi 68 | 69 | # 1st stage: Clip video and generate optical flow out of it 70 | for class in $SHAD_TOP_DIR/*; do 71 | if [ ! -d $class/"video"/ ]; then 72 | echo "Please use original SHAD dataset" 73 | exit 1 74 | fi 75 | echo "Preprocess $class" 76 | CLASS_NAME=$(basename $class) 77 | cnt=0 78 | for video in $class/"video"/*; do 79 | VIDEO_NAME=$(echo $(basename $video) | cut -d . -f1) 80 | ANNO_PATH=$class/"Annotations"/$VIDEO_NAME 81 | RGB_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME/"rgb" 82 | OF_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME/"of" 83 | OF_IMG_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME/"of_img" 84 | TEMP_VIDEO_PATH=$TEMP_DIR/$CLASS_NAME/$VIDEO_NAME 85 | MKDIR $RGB_PATH 86 | MKDIR $OF_PATH 87 | MKDIR $OF_IMG_PATH 88 | VIDEO_LIST[$cnt]=$video 89 | ANNO_PATH_LIST[$cnt]=$ANNO_PATH 90 | RGB_PATH_LIST[$cnt]=$RGB_PATH 91 | OF_PATH_LIST[$cnt]=$OF_PATH 92 | OF_IMG_PATH_LIST[$cnt]=$OF_IMG_PATH 93 | TEMP_VIDEO_PATH_LIST[$cnt]=$TEMP_VIDEO_PATH 94 | 95 | cnt=$((cnt + 1)) 96 | if [ $cnt -eq $WORKER_CNT ]; then 97 | cnt=0 98 | RUN_WORKERS 99 | VIDEO_LIST=("NULL" "NULL" "NULL" "NULL") 100 | RGB_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 101 | OF_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 102 | OF_IMG_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 103 | ANNO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 104 | TEMP_VIDEO_PATH_LIST=("NULL" "NULL" "NULL" "NULL") 105 | fi 106 | done 107 | if [ $cnt -ne 0 ]; then 108 | RUN_WORKERS 109 | fi 110 | done 111 | 112 | rm -r $TEMP_DIR 113 | 114 | python generate_new_dataset_format.py $TEMP_DIR_ $OUTPUT_TOP_DIR 115 | 116 | rm -r $TEMP_DIR_ 117 | -------------------------------------------------------------------------------- /tao_key_points_estimation/tensorrt_inference/trt_inference/engine.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import logging 17 | import os 18 | import numpy as np 19 | import tensorrt as trt 20 | import pycuda.autoinit 21 | import pycuda.driver as cuda 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 26 | 27 | 28 | # Array of TensorRT loggers. We need to keep global references to 29 | # the TensorRT loggers that we create to prevent them from being 30 | # garbage collected as those are referenced from C++ code without 31 | # Python knowing about it. 32 | 33 | 34 | tensorrt_loggers = [] 35 | 36 | 37 | def _create_tensorrt_logger(verbose=False): 38 | """Create a TensorRT logger. 39 | 40 | Args: 41 | verbose (bool): whether to make the logger verbose. 42 | """ 43 | if verbose: 44 | # trt_verbosity = trt.Logger.Severity.INFO 45 | trt_verbosity = trt.Logger.Severity.VERBOSE 46 | else: 47 | trt_verbosity = trt.Logger.Severity.WARNING 48 | tensorrt_logger = trt.Logger(trt_verbosity) 49 | tensorrt_loggers.append(tensorrt_logger) 50 | return tensorrt_logger 51 | 52 | 53 | class HostDeviceMem(object): 54 | def __init__(self, host_mem, device_mem, binding_name, shape=None): 55 | self.host = host_mem 56 | self.device = device_mem 57 | self.binding_name = binding_name 58 | self.shape = shape 59 | 60 | def __str__(self): 61 | return "Host:\n" + str(self.host) + "\nDevice\n" + str(self.device) 62 | 63 | def __repr__(self): 64 | return self.__str__() 65 | 66 | 67 | def allocate_buffers(engine, context): 68 | 69 | inputs = [] 70 | outputs = [] 71 | bindings = [] 72 | stream = cuda.Stream() 73 | for binding in engine: 74 | binding_id = engine.get_binding_index(str(binding)) 75 | size = trt.volume(context.get_binding_shape(binding_id)) * engine.max_batch_size 76 | print("{}:{}".format(binding, size)) 77 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 78 | host_mem = cuda.pagelocked_empty(size, dtype) 79 | device_mem = cuda.mem_alloc(host_mem.nbytes) 80 | bindings.append(int(device_mem)) 81 | if engine.binding_is_input(binding): 82 | inputs.append(HostDeviceMem(host_mem, device_mem, binding)) 83 | else: 84 | output_shape = engine.get_binding_shape(binding) 85 | if len(output_shape) == 3: 86 | dims = trt.Dims3(engine.get_binding_shape(binding)) 87 | output_shape = (engine.max_batch_size, dims[0], dims[1], dims[2]) 88 | elif len(output_shape) == 2: 89 | dims = trt.Dims2(output_shape) 90 | output_shape = (engine.max_batch_size, dims[0], dims[1]) 91 | outputs.append(HostDeviceMem(host_mem, device_mem, binding, output_shape)) 92 | 93 | return inputs, outputs, bindings, stream 94 | 95 | 96 | def do_inference(batch, context, bindings, inputs, outputs, stream): 97 | batch_size = batch.shape[0] 98 | assert len(inputs) == 1 99 | inputs[0].host = np.ascontiguousarray(batch, dtype=np.float32) 100 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 101 | 
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 102 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 103 | stream.synchronize() 104 | 105 | outputs_dict = {} 106 | outputs_shape = {} 107 | for out in outputs: 108 | outputs_dict[out.binding_name] = np.reshape(out.host, out.shape) 109 | outputs_shape[out.binding_name] = out.shape 110 | 111 | return outputs_shape, outputs_dict 112 | 113 | 114 | def load_tensorrt_engine(filename, verbose=False): 115 | tensorrt_logger = _create_tensorrt_logger(verbose) 116 | 117 | if not os.path.exists(filename): 118 | raise ValueError("{} does not exist".format(filename)) 119 | 120 | with trt.Runtime(tensorrt_logger) as runtime, open(filename, "rb") as f: 121 | trt_engine = runtime.deserialize_cuda_engine(f.read()) 122 | 123 | return trt_engine 124 | -------------------------------------------------------------------------------- /tao_classification/deploy_to_deepstream/README.md: -------------------------------------------------------------------------------- 1 | # Deploy Classification model to Deepstream 2 | Some tips for deploying a TAO classification model to DeepStream. 3 | 4 | # Deploy Classification model as primary tensorrt engine 5 | There are two ways of deploying a classification model in DeepStream. 6 | One works as the primary TensorRT engine, the other as a secondary TensorRT engine. 7 | 8 | 9 | ## Detailed Steps 10 | 11 | * Create ds_classification_as_primary_gie.txt. Refer to [link](https://forums.developer.nvidia.com/t/issue-with-image-classification-tutorial-and-testing-with-deepstream-app/165835/12?u=morganh) 12 | 13 | Below is a snippet of the config file. 14 | 15 | ``` 16 | # config-file property is mandatory for any gie section. 17 | # Other properties are optional and if set will override the properties set in 18 | # the infer config file. 19 | [primary-gie] 20 | enable=1 21 | gpu-id=0 22 | #model-engine-file=your_classification.engine 23 | batch-size=1 24 | #Required by the app for OSD, not a plugin property 25 | bbox-border-color0=1;0;0;1 26 | bbox-border-color1=0;1;1;1 27 | bbox-border-color2=0;0;1;1 28 | bbox-border-color3=0;1;0;1 29 | interval=0 30 | gie-unique-id=1 31 | nvbuf-memory-type=0 32 | config-file=config_as_primary_gie.txt 33 | ``` 34 | 35 | * Create config_as_primary_gie.txt. 36 | ``` 37 | [property] 38 | gpu-id=0 39 | net-scale-factor=1.0 40 | #below offsets=b,g,r which can also be changed according to the "image_mean" in your training spec file. 41 | offsets=123.67;116.28;103.53 42 | model-color-format=1 43 | batch-size=30 44 | 45 | tlt-model-key=yourkey 46 | tlt-encoded-model=your_unpruned_or_pruned_model.etlt 47 | labelfile-path=labels.txt 48 | #int8-calib-file=cal.bin 49 | #model-engine-file=your_classification.engine 50 | #input-dims=c;h;w;0. Can also be changed according to the "input_image_size" in your training spec file. 51 | input-dims=3;224;224;0 52 | uff-input-blob-name=input_1 53 | output-blob-names=predictions/Softmax 54 | 55 | # process-mode: 2 - inferences on crops from primary detector, 1 - inferences on whole frame 56 | process-mode=1 57 | ## 0=FP32, 1=INT8, 2=FP16 mode 58 | network-mode=0 59 | 60 | network-type=1 # defines that the model is a classifier.
61 | num-detected-classes=2 62 | interval=0 63 | gie-unique-id=1 64 | #threshold=0.05 65 | classifier-async-mode=1 66 | classifier-threshold=0.2 67 | operate-on-gie-id=1 68 | #operate-on-class-ids=0 69 | ``` 70 | 71 | * Run deepstream-app 72 | ``` 73 | $ deepstream-app -c ds_classification_as_primary_gie.txt 74 | ``` 75 | 76 | # Deploy Classification model as secondary tensorrt engine 77 | 78 | ## Detailed Steps 79 | 80 | * Create ds_classification_as_secondary_gie.txt. Refer to [link](https://forums.developer.nvidia.com/t/issue-with-image-classification-tutorial-and-testing-with-deepstream-app/165835/12?u=morganh) 81 | 82 | Below is a snippet of the config file. 83 | 84 | ``` 85 | [secondary-gie3] 86 | enable=1 87 | #model-engine-file=your_classification.engine 88 | batch-size=4 89 | gpu-id=0 90 | gie-unique-id=7 91 | operate-on-gie-id=1 92 | #operate-on-class-ids=0; 93 | config-file=config_as_secondary_gie.txt 94 | ``` 95 | 96 | * Create config_as_secondary_gie.txt. 97 | 98 | ``` 99 | [property] 100 | gpu-id=0 101 | net-scale-factor=1.0 102 | #below offsets=b,g,r which can also be changed according to the "image_mean" in your training spec file. 103 | offsets=123.67;116.28;103.53 104 | model-color-format=1 105 | batch-size=30 106 | 107 | tlt-model-key=yourkey 108 | tlt-encoded-model=your_unpruned_or_pruned_model.etlt 109 | labelfile-path=labels.txt 110 | #int8-calib-file=cal.bin 111 | #model-engine-file=your_classification.engine 112 | #input-dims=c;h;w;0. Can also be changed according to the "input_image_size" in your training spec file. 113 | input-dims=3;224;224;0 114 | uff-input-blob-name=input_1 115 | output-blob-names=predictions/Softmax 116 | 117 | # process-mode: 2 - inferences on crops from primary detector, 1 - inferences on whole frame 118 | process-mode=2 119 | ## 0=FP32, 1=INT8, 2=FP16 mode 120 | network-mode=0 121 | 122 | network-type=1 # defines that the model is a classifier. 123 | num-detected-classes=2 124 | interval=0 125 | gie-unique-id=1 126 | #threshold=0.05 127 | classifier-async-mode=1 128 | classifier-threshold=0.2 129 | operate-on-gie-id=1 130 | #operate-on-class-ids=0 131 | ``` 132 | 133 | * Run deepstream-app 134 | ``` 135 | $ deepstream-app -c ds_classification_as_secondary_gie.txt 136 | ``` 137 | 138 | # Other tips 139 | ## Generate an AVI video file as the input test file. It works better than an MP4 input file. 140 | ``` 141 | gst-launch-1.0 multifilesrc location="/tmp/%d.jpg" caps="image/jpeg,framerate=30/1" ! jpegdec ! x264enc ! avimux ! filesink location="out.avi" 142 | ``` 143 | 144 | ## Change "scaling-filter". More info can be found in [DeepStream Gst-nvinfer Plugin](https://docs.nvidia.com/metropolis/deepstream/dev-guide/text/DS_plugin_gst-nvinfer.html#gst-nvinfer-file-configuration-specifications) 145 | ``` 146 | scaling-filter=5 147 | ``` 148 | -------------------------------------------------------------------------------- /tao_api/how_to_modify_code_for_TAO_API.md: -------------------------------------------------------------------------------- 1 | # How to modify code for TAO API 2 | 3 | This guide walks you through the detailed steps of modifying the code and generating a new docker image for TAO API. 4 | 5 | 6 | ## Trigger docker in one host machine and modify code 7 | Open a terminal and trigger the 4.0.0 TAO API docker. 8 | ```shell 9 | $ docker run -it --name tao-api-fixed nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api /bin/bash 10 | ``` 11 | 12 | Open another terminal. You need to build a new docker image based on nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api.
13 | First, create a folder, copy the file from the container to your local host, and modify it as below. 14 | ``` 15 | nvidia@host:~$ mkdir docker_build && cd docker_build 16 | nvidia@host:~/docker_build$ docker cp tao-api-fixed:/opt/api ./ 17 | nvidia@host:~/docker_build$ cd api 18 | nvidia@host:~/docker_build/api$ vim handlers/actions.py 19 | ``` 20 | 21 | Go to line:779 and change the code from 22 | ```shell 23 | if find_trained_weight == []: 24 | if not ptm_id == "": 25 | model_dir = f"/shared/users/00000000-0000-0000-0000-000000000000/models/{ptm_id}" 26 | if job_context.network == "lprnet": 27 | pretrained_model_file = glob.glob(model_dir+"/*/*.tlt") 28 | else: 29 | pretrained_model_file = glob.glob(model_dir+"/*/*.hdf5") 30 | else: 31 | find_trained_weight.sort(reverse=False) 32 | trained_weight = find_trained_weight[0] 33 | ``` 34 | to 35 | 36 | ```shell 37 | if find_trained_weight == []: 38 | if not ptm_id == "": 39 | model_dir = f"/shared/users/00000000-0000-0000-0000-000000000000/models/{ptm_id}" 40 | pretrained_model_file = [] 41 | pretrained_model_file = glob.glob(model_dir+"/*/*.hdf5") + glob.glob(model_dir+"/*/*.tlt") 42 | if len(pretrained_model_file) > 1: 43 | pretrained_model_file = pretrained_model_file[0] 44 | 45 | assert pretrained_model_file != [], "error pretrained_model_file" 46 | else: 47 | find_trained_weight.sort(reverse=False) 48 | trained_weight = find_trained_weight[0] 49 | ``` 50 | 51 | 52 | Change docker_images.py and change the code 53 | ```shell 54 | nvidia@host:~/docker_build/api$ vim handlers/docker_images.py 55 | ``` 56 | Go to line 23 and replace the docker image name from 57 | ```shell 58 | "api": os.getenv('IMAGE_API', default='nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api') 59 | ``` 60 | To 61 | 62 | ```shell 63 | "api": os.getenv('IMAGE_API', default='nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api-fix') 64 | ``` 65 | 66 | ## Generate a new docker 67 | Create a Dockerfile 68 | ```shell 69 | nvidia@host:~/docker_build/api$ mv Dockerfile Dockerfile_bak 70 | nvidia@host:~/docker_build/api$ vim Dockerfile 71 | ``` 72 | 73 | Below is the content of Dockerfile 74 | ```shell 75 | nvidia@host:~/docker_build/api$ cat Dockerfile 76 | ################ BUILD IMAGE ################# 77 | FROM nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api 78 | # Copy project files 79 | WORKDIR /opt/api 80 | COPY handlers/actions.py handlers/actions.py 81 | COPY handlers/docker_images.py handlers/docker_images.py 82 | ENV PATH=“/opt/ngccli/ngc-cli:${PATH}” 83 | # Default command 84 | CMD /bin/bash app_start.sh 85 | ``` 86 | 87 | ```shell 88 | nvidia@host:~/docker_build/api$ docker build . -t nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api-fix 89 | ``` 90 | 91 | ## Save the docker to tar file 92 | ```shell 93 | $ docker save -o tao-api.tar nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api-fix 94 | ``` 95 | 96 | Copy the tar file to k8s machine 97 | ```shell 98 | $ scp tao-api.tar ip_k8s_machine:/path/to/save 99 | ``` 100 | 101 | ## Import the new image 102 | In k8s machines, 103 | ```shell 104 | $ sudo ctr -n=k8s.io image import tao-api.tar 105 | ``` 106 | 107 | ## Delete the old pods 108 | Delete existing tao-toolkit-api pods 109 | ```shell 110 | $ helm delete tao-toolkit-api 111 | ``` 112 | 113 | ## Download chart and modify 114 | Download latest helm chart. 
115 | ```shell 116 | $ helm fetch https://helm.ngc.nvidia.com/nvidia/tao/charts/tao-toolkit-api-4.0.2.tgz --username='$oauthtoken' --password= 117 | $ mkdir tao-toolkit-api && tar -zxvf tao-toolkit-api-4.0.2.tgz -C tao-toolkit-api 118 | $ cd tao-toolkit-api/ 119 | ``` 120 | 121 | Modify the image name. 122 | ```shell 123 | $ vi tao-toolkit-api/values.yaml 124 | 125 | # in line 2 126 | From 127 | image: nvcr.io/nvidia/tao/tao-toolkit:4.0.2-api 128 | To 129 | image: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-api-fix 130 | 131 | # in line 4 132 | From 133 | imagePullPolicy: Always 134 | To 135 | imagePullPolicy: IfNotPresent 136 | ``` 137 | 138 | ## Install latest chart 139 | ```shell 140 | $ helm install tao-toolkit-api tao-toolkit-api/ --namespace default 141 | ``` 142 | 143 | Verify the latest code inside the container. 144 | ```shell 145 | $ kubectl get pods 146 | $ kubectl exec -it tao-toolkit-api-app-pod-5d4d74c65c-k8zt5 -- /bin/bash 147 | root@tao-toolkit-api-app-pod-5d4d74c65c-k8zt5:/opt/api# apt-get install vim 148 | root@tao-toolkit-api-app-pod-5d4d74c65c-k8zt5:/opt/api# vim handlers/actions.py 149 | ``` 150 | -------------------------------------------------------------------------------- /tao_forum_faq/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## FPENet 4 | 1. *Why is the bounding box recalculated just using the key points when I have also supplied the face bbox ground truth in the annotation file? What is the purpose of the bbox in the ground truth file?* 5 | 6 | The annotation file just provides all the key points. FPENet finds the xmin, ymin, xmax, and ymax of the points, calculates a square face bounding box based on them, then crops that bounding box from the image and scales the key points to the target resolution. 7 | 8 | ## Emotionnet 9 | 1. *How to find the input name of EmotionNet?* 10 | ``` 11 | tao-converter model.etlt 12 | -k nvidia_tlt 13 | -t fp32 14 | -p input_landmarks:0,1x1x136x1,1x1x136x1,2x1x136x1 15 | -e model.engine 16 | ``` 17 | 18 | ## tlt or etlt 19 | 1. *How to decode etlt file?* 20 | ``` 21 | $ docker run --runtime=nvidia -it --rm -v /home/morganh:/home/morganh nvcr.io/nvidia/tao/tao-toolkit:5.0.0-tf1.15.5 /bin/bash 22 | # wget --content-disposition 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/tao/fpenet/deployable_v1.0/files?redirect=true&path=model.etlt' -O fpenet_model_v1.0.etlt 23 | ``` 24 | 25 | Generate the decode_etlt.py file as below.
26 | ``` 27 | import argparse 28 | import struct 29 | from nvidia_tao_tf1.encoding import encoding 30 | 31 | def parse_command_line(args): 32 | '''Parse command line arguments.''' 33 | parser = argparse.ArgumentParser(description='ETLT Decode Tool') 34 | parser.add_argument('-m', 35 | '--model', 36 | type=str, 37 | required=True, 38 | help='Path to the etlt file.') 39 | parser.add_argument('-o', 40 | '--uff', 41 | required=True, 42 | type=str, 43 | help='The path to the uff file.') 44 | parser.add_argument('-k', 45 | '--key', 46 | required=True, 47 | type=str, 48 | help='encryption key.') 49 | return parser.parse_args(args) 50 | 51 | 52 | def decode(tmp_etlt_model, tmp_uff_model, key): 53 | with open(tmp_uff_model, 'wb') as temp_file, open(tmp_etlt_model, 'rb') as encoded_file: 54 | size = encoded_file.read(4) 55 | size = struct.unpack(" --target_dir 19 | ``` 20 | 21 | Assume that the paths from inside the TAO Toolkit container to the dataset are as follows: 22 | 23 | ```shell 24 | /home//tao-experiments/data/imagenet2012/train 25 | /home//tao-experiments/data/imagenet2012/val 26 | ``` 27 | 28 | The first path is a directory that contains all the training images, where each of the 1K classes has its own subdirectory. The same is assumed for the validation split as well. The structure of the classification dataset follows the [TAO Toolkit classification model training requirements](https://docs.nvidia.com/tao/tao-toolkit/text/data_annotation_format.html#image-classification-format). 29 | 30 | ### Training specification: 31 | For every TAO Toolkit model training, you have a configuration file (spec file) to configure some necessary parameters used to customize the model and the training process. Please refer to 32 | [classification_cspdarknet53.txt](specs/classification_cspdarknet53.txt) for the training spec file. 33 | 34 | ### Start Training: 35 | Run following command to start training on 8 GPUs: 36 | 37 | ``` 38 | tao classification train --gpus 8 -e -r -k nvidia_tao 39 | ``` 40 | For 8 x A100 GPUs, the training will require about ~30 hours. And you might get around 78.3% of val accuracy. 41 | 42 | 43 | ## Train YOLOV4-CSPDarkNet53 on COCO14 dataset 44 | In this section, you will train YOLOV4 on COCO14 dataset with imagenet pretrained CSPDarkNet53 backbone you create in the first section. 45 | 46 | ### Prepare COCO14 dataset 47 | Firstly, you will prepare the COCO14 dataset for training. To compare with the SOTA model, you will do the training/testing split the same way as the original YOLOV4. And this split is different from the official split of COCO14. You could download the original COCO14 dataset and the split images list (5k.txt/trainvalno5k.txt)by running [get_coco_dataset.sh](https://raw.githubusercontent.com/AlexeyAB/darknet/master/scripts/get_coco_dataset.sh) 48 | 49 | After the downloading dataset, you should convert the json format labels to KITTI format by using [coco2kitti.py](https://github.com/NVIDIA-AI-IOT/deepstream_tao_apps/blob/release/tao3.0/misc/dev_blog/SOTA/dataset_tools/coco2kitti.py) 50 | 51 | ```shell 52 | # Convert instances_train2014.json to KITTI format 53 | python3 ./coco2kitti.py train2014 54 | mv ./labels ./train2014_KITTI 55 | 56 | # Convert instances_val2014.json to KITTI format 57 | python3 ./coco2kitti.py val2014 58 | mv ./labels ./val2014_KITTI 59 | ``` 60 | Once you get the images and KITTI format labels, you could re-split them according to 5k.txt / trainvalno5k.txt. 
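Below is a minimal re-split sketch (an illustrative addition, not part of the original scripts). It assumes `trainvalno5k.txt` / `5k.txt` list one image path per line, that the KITTI labels produced above live in `train2014_KITTI` / `val2014_KITTI`, and that the output folder names (`yolo_split/...`) are placeholders you can rename to match your training spec:

```python
# Hypothetical helper: re-split COCO14 images and KITTI labels according to the
# YOLO list files. Labels are looked up in both KITTI folders because the YOLO
# split mixes images from the official train2014 and val2014 sets.
import os
import shutil

def resplit(list_file, out_img_dir, out_lbl_dir, kitti_label_dirs):
    os.makedirs(out_img_dir, exist_ok=True)
    os.makedirs(out_lbl_dir, exist_ok=True)
    with open(list_file, "r") as f:
        for line in f:
            img_path = line.strip()
            if not img_path:
                continue
            img_name = os.path.basename(img_path)
            label_name = os.path.splitext(img_name)[0] + ".txt"
            shutil.copy(img_path, os.path.join(out_img_dir, img_name))
            for label_dir in kitti_label_dirs:
                label_path = os.path.join(label_dir, label_name)
                if os.path.exists(label_path):
                    shutil.copy(label_path, os.path.join(out_lbl_dir, label_name))
                    break

kitti_label_dirs = ["./train2014_KITTI", "./val2014_KITTI"]
resplit("trainvalno5k.txt", "./yolo_split/train/images", "./yolo_split/train/labels", kitti_label_dirs)
resplit("5k.txt", "./yolo_split/val/images", "./yolo_split/val/labels", kitti_label_dirs)
```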
61 | 62 | ### Training specification: 63 | Before we start the training, there are 3 more steps to get better results on the dataset. 64 | 65 | #### Generate anchor setting: 66 | TAO Toolkit YOLOV4 supports clustering the ground truth bboxes to find a suitable anchor setting for a specific dataset: 67 | 68 | ```shell 69 | tao yolo_v4 kmeans -l \ 70 | -i \ 71 | -x \ 72 | -y 73 | ``` 74 | 75 | You can replace the `small_anchor_shape`, `mid_anchor_shape`, `big_anchor_shape` in `yolov4_config` with the generated anchor shapes. 76 | 77 | #### Enable model EMA 78 | TAO Toolkit YOLOV4 supports the model exponential moving average (EMA) during the training. Enable it by setting `model_ema: true` in the `train_config`. 79 | 80 | You can also do a hyperparameter search (learning rate, learning rate scheduler, regularization factor) using part of the dataset or fewer epochs. Here we provide a [spec](specs/yolov4_416_coco14.txt) to train YOLOV4-416-Leaky on the COCO14 YOLO split. In this spec, we use raw KITTI-style labels for training. 81 | 82 | ### Start Training: 83 | Run the following command to start training on 8 GPUs: 84 | 85 | ``` 86 | tao yolo_v4 train --gpus=8 -e -r -k nvidia_tao 87 | ``` 88 | The training will require ~130 hours on 8x V100 16GB, and you might get around 60.9% mAP@0.5 using COCO-style metrics. 89 | -------------------------------------------------------------------------------- /tao_key_points_estimation/tensorrt_inference/fpenet_trt_inference.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | 16 | import argparse 17 | import cv2 18 | from trt_inference import allocate_buffers, do_inference, load_tensorrt_engine 19 | import json 20 | import numpy as np 21 | import os 22 | import tqdm 23 | 24 | INPUT_CHANNEL=1 25 | INPUT_WIDTH=80 26 | INPUT_HEIGHT=80 27 | NUM_KEYPOINTS=6 28 | 29 | 30 | def preprocess(sample): 31 | fname = str(sample['filename']) 32 | 33 | for chunk in sample['annotations']: 34 | if 'facebbox' not in chunk['class'].lower(): 35 | continue 36 | 37 | bbox_data = (entry for entry in chunk if ('class' not in entry and 38 | 'version' not in entry)) 39 | for entry in bbox_data: 40 | if 'face_tight_bboxheight' in str(entry).lower(): 41 | height = int(float(chunk[entry])) 42 | if 'face_tight_bboxwidth' in str(entry).lower(): 43 | width = int(float(chunk[entry])) 44 | if 'face_tight_bboxx' in str(entry).lower(): 45 | x = int(float(chunk[entry])) 46 | if 'face_tight_bboxy' in str(entry).lower(): 47 | y = int(float(chunk[entry])) 48 | 49 | image = cv2.imread(os.path.join(fname)) 50 | 51 | image_shape = image.shape 52 | image_height = image_shape[0] 53 | image_width = image_shape[1] 54 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 55 | image = np.float32(image) 56 | 57 | # transform it into a square bbox wrt the longer side 58 | longer_side = max(width, height) 59 | new_width = longer_side 60 | new_height = longer_side 61 | x = int(x - (new_width - width) / 2) 62 | y = int(y - (new_height - height) / 2) 63 | x = min(max(x, 0), image_width) 64 | y = min(max(y, 0), image_height) 65 | new_width = min(new_width, image_width - x) 66 | new_height = min(new_height, image_height - y) 67 | new_width = min(new_width, new_height) 68 | new_height = new_width # make it a square bbox 69 | crop_bbox = [x, y, new_width, new_height] 70 | 71 | # crop the face bounding box 72 | img_crop = image[y:y + new_height, x:x + new_width, :] # pylint:disable=E1136 73 | image_resized = cv2.resize(img_crop, 74 | (INPUT_HEIGHT, INPUT_WIDTH), 75 | interpolation=cv2.INTER_CUBIC) 76 | if INPUT_CHANNEL == 1: 77 | image_resized = cv2.cvtColor(image_resized, cv2.COLOR_BGR2GRAY) 78 | image_resized = np.expand_dims(image_resized, 2) 79 | # make it channel first (channel, height, width) 80 | image_resized = np.transpose(image_resized, (2, 0, 1)) 81 | image_resized = np.expand_dims(image_resized, 0).astype(np.float32) # add batch dimension 82 | 83 | return crop_bbox, image_resized 84 | 85 | 86 | def postprocess(outputs, crop_bbox): 87 | 88 | keypoints = outputs['softargmax/strided_slice:0'] 89 | scale = float(crop_bbox[2]) / INPUT_HEIGHT 90 | shift = np.tile(np.array((crop_bbox[0], crop_bbox[1])), 91 | (NUM_KEYPOINTS, 1)) 92 | result = (keypoints[0, :, :] * scale) + shift 93 | 94 | return result 95 | 96 | 97 | if __name__ == "__main__": 98 | 99 | parser = argparse.ArgumentParser(description='Do FPENet inference using TRT') 100 | parser.add_argument('--input_json', type=str, help='input json path', required=True) 101 | parser.add_argument('--trt_engine', type=str, help='trt engine file path', required=True) 102 | parser.add_argument('--output_img_dir', type=str, help='output imgs save path') 103 | 104 | args = parser.parse_args() 105 | 106 | batch_size = 1 107 | engine_file = args.trt_engine 108 | input_json = args.input_json 109 | output_dir = args.output_img_dir 110 | 111 | 112 | with load_tensorrt_engine(engine_file) as engine: 113 | with engine.create_execution_context() as context: 114 | context.set_binding_shape(0, (1, INPUT_CHANNEL, INPUT_HEIGHT, INPUT_WIDTH)) 115 | inputs, outputs, bindings, stream 
= allocate_buffers(engine, context) 116 | json_data = json.loads(open(input_json , 'r').read()) 117 | results = [] 118 | for sample in tqdm.tqdm(json_data): 119 | fname = str(sample['filename']) 120 | crop_bbox, img = preprocess(sample) 121 | outputs_shape, outputs_data = do_inference(batch=img, context=context, 122 | bindings=bindings, inputs=inputs, 123 | outputs=outputs, stream=stream) 124 | keypoints = postprocess(outputs_data, crop_bbox) 125 | keypoints = keypoints[0] 126 | img = cv2.imread(fname) 127 | for idx, kp in enumerate(keypoints): 128 | x = kp[0] 129 | y = kp[1] 130 | cv2.circle(img,(int(x), int(y)), 1, (0,255,0), 2) 131 | cv2.putText(img, str(idx), (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0,255,0), 1) 132 | cv2.imwrite(os.path.join(output_dir, fname.split("/")[-1]), img) 133 | -------------------------------------------------------------------------------- /tao_action_recognition/tensorrt_inference/ar_trt_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
20 | 21 | import argparse 22 | from trt_inference import allocate_buffers, do_inference, load_tensorrt_engine 23 | import numpy as np 24 | import PIL 25 | from PIL import Image 26 | import os 27 | 28 | SEQ = 32 29 | CENTER_CROP = False 30 | INPUT_2D = False 31 | 32 | 33 | def preprocess_ds_ncdhw(batch_img): 34 | batch_img_array = np.array([np.array(img) for img in batch_img], dtype=np.float32) 35 | batch_img_array = ((batch_img_array / 255.0) - 0.5) / 0.5 36 | batch_transpose = np.transpose(batch_img_array, (3, 0, 1, 2)) 37 | if INPUT_2D: 38 | batch_reshape = np.reshape(batch_transpose, (3*SEQ, 224, 224)) 39 | else: 40 | batch_reshape = batch_transpose 41 | 42 | return batch_reshape 43 | 44 | 45 | def test_consecutive_sample(max_sample_cnt, seq_length, sample_rate=1): 46 | """Choose the middle consecutive frames of each video.""" 47 | total_frames_req = seq_length * sample_rate 48 | average_duration = max_sample_cnt - total_frames_req + 1 49 | if average_duration > 0: 50 | start_idx = int(average_duration/2.0) 51 | else: 52 | start_idx = 0 53 | 54 | img_ids = start_idx + np.arange(seq_length) * sample_rate 55 | # # loop the video to form sequence: 56 | img_ids = np.mod(img_ids, max_sample_cnt) 57 | 58 | return img_ids 59 | 60 | 61 | def sample_patch(img_root_path, seq_len=SEQ): 62 | img_list = sorted(os.listdir(img_root_path)) 63 | img_id_list = [] 64 | if len(img_list) < seq_len: 65 | img_ids = np.arange(seq_len) 66 | img_ids = np.mod(img_ids, len(img_list)) 67 | img_id_list.append(img_ids) 68 | else: 69 | end_index = len(img_list) - seq_len + 1 70 | for idx in range(end_index): 71 | img_ids = idx + np.arange(seq_len) 72 | img_id_list.append(img_ids) 73 | return img_id_list 74 | 75 | 76 | def resize_and_center_crop(img): 77 | # resize the short side to 224 78 | w, h = img.size 79 | if h <= w: 80 | target_w = int((224.0 / float(h)) * w) 81 | resized_img = img.resize((target_w, 224), resample=PIL.Image.BILINEAR) 82 | else: 83 | target_h = int((224.0 / float(w)) * h) 84 | resized_img = img.resize((224, target_h), resample=PIL.Image.BILINEAR) 85 | 86 | # center crop to 224x224 87 | resized_w, resized_h = resized_img.size 88 | center_x = (resized_w - 224) / 2 89 | center_y = (resized_h - 224) / 2 90 | crop_img = resized_img.crop((center_x, center_y, center_x + 224, center_y + 224)) 91 | 92 | return crop_img 93 | 94 | 95 | def load_images(img_ids, img_root_path): 96 | img_list = sorted(os.listdir(img_root_path)) 97 | 98 | raw_imgs = [] 99 | for img_id in img_ids: 100 | img_path = os.path.join(img_root_path, img_list[img_id]) 101 | img = Image.open(img_path) 102 | if CENTER_CROP: 103 | img = resize_and_center_crop(img) 104 | else: 105 | img = img.resize((224, 224), resample=PIL.Image.BILINEAR) 106 | 107 | raw_imgs.append(img) 108 | 109 | images = preprocess_ds_ncdhw(raw_imgs) 110 | 111 | return images 112 | 113 | 114 | def get_prob(pred): 115 | 116 | pred = pred - pred.max() 117 | pred_exp = np.exp(pred) 118 | 119 | return pred_exp.max()/pred_exp.sum() 120 | 121 | 122 | if __name__ == "__main__": 123 | 124 | parser = argparse.ArgumentParser(description='Do AR inference using TRT') 125 | parser.add_argument('--input_images_folder', type=str, help='input images path', required=True) 126 | parser.add_argument('--trt_engine', type=str, help='trt engine file path', required=True) 127 | parser.add_argument('--center_crop', action="store_true", help='resize the short side of image to 224 and center crop 224x224') 128 | parser.add_argument('--input_2d', action="store_true", help='set if it is a 2d 
model') 129 | 130 | args = parser.parse_args() 131 | 132 | if args.center_crop: 133 | CENTER_CROP = True 134 | 135 | if args.input_2d: 136 | INPUT_2D = True 137 | 138 | batch_size = 1 139 | engine_file = args.trt_engine 140 | label_map = ["push", "fall_floor", "walk", "run", "ride_bike"] 141 | img_root = args.input_images_folder 142 | batch_cnt = 1 143 | 144 | total_cnt = 0 145 | ac_cnt = 0 146 | 147 | with load_tensorrt_engine(engine_file) as engine: 148 | with engine.create_execution_context() as context: 149 | if INPUT_2D: 150 | context.set_binding_shape(0, (1, 3*SEQ, 224, 224)) 151 | else: 152 | context.set_binding_shape(0, (1, 3, SEQ, 224, 224)) 153 | inputs, outputs, bindings, stream = allocate_buffers(engine, context) 154 | img_ids_list = sample_patch(img_root) 155 | for img_ids in img_ids_list: 156 | images = load_images(img_ids, img_root) 157 | for sample_id in range(batch_size): 158 | batch_images = images 159 | # Hard Coded For explicit_batch and the ONNX model's batch_size = 1 160 | batch_images = batch_images[np.newaxis, :, :, :] 161 | outputs_shape, outputs_data = do_inference(batch=batch_images, context=context, 162 | bindings=bindings, inputs=inputs, 163 | outputs=outputs, stream=stream) 164 | 165 | pred_data = np.squeeze(outputs_data['fc_pred']) 166 | label = label_map[np.argmax(pred_data)] 167 | prob = get_prob(pred_data) 168 | print("{} : {} {}".format(img_ids, label, prob)) 169 | -------------------------------------------------------------------------------- /tao_action_recognition/tensorrt_inference/ar_of_trt_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to deal 5 | # in the Software without restriction, including without limitation the rights 6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | # copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all 11 | # copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | # SOFTWARE. 
20 | 21 | import argparse 22 | from trt_inference import allocate_buffers, do_inference, load_tensorrt_engine 23 | import numpy as np 24 | import PIL 25 | from PIL import Image 26 | import os 27 | 28 | SEQ = 32 29 | CENTER_CROP = False 30 | INPUT_2D = False 31 | 32 | 33 | def preprocess_ds_ncdhw(batch_img): 34 | batch_img_array = np.array(batch_img, dtype=np.float32) 35 | batch_img_array = ((batch_img_array / 255.0) - 0.5) / 0.5 36 | batch_transpose = np.transpose(batch_img_array, (3, 0, 1, 2)) 37 | if INPUT_2D: 38 | batch_reshape = np.reshape(batch_transpose, (2*SEQ, 224, 224)) 39 | else: 40 | batch_reshape = batch_transpose 41 | 42 | return batch_reshape 43 | 44 | 45 | def test_consecutive_sample(max_sample_cnt, seq_length, sample_rate=1): 46 | """Choose the middle consecutive frames of each video.""" 47 | total_frames_req = seq_length * sample_rate 48 | average_duration = max_sample_cnt - total_frames_req + 1 49 | if average_duration > 0: 50 | start_idx = int(average_duration/2.0) 51 | else: 52 | start_idx = 0 53 | 54 | img_ids = start_idx + np.arange(seq_length) * sample_rate 55 | # # loop the video to form sequence: 56 | img_ids = np.mod(img_ids, max_sample_cnt) 57 | 58 | return img_ids 59 | 60 | 61 | def sample_patch(img_root_path, seq_len=SEQ): 62 | img_list = sorted(os.listdir(os.path.join(img_root_path, "u"))) 63 | img_id_list = [] 64 | if len(img_list) < seq_len: 65 | img_ids = np.arange(seq_len) 66 | img_ids = np.mod(img_ids, len(img_list)) 67 | img_id_list.append(img_ids) 68 | else: 69 | end_index = len(img_list) - seq_len + 1 70 | for idx in range(end_index): 71 | img_ids = idx + np.arange(seq_len) 72 | img_id_list.append(img_ids) 73 | return img_id_list 74 | 75 | 76 | def resize_and_center_crop(img): 77 | # resize the short side to 224 78 | w, h = img.size 79 | if h <= w: 80 | target_w = int((256.0 / float(h)) * w) 81 | resized_img = img.resize((target_w, 224), resample=PIL.Image.BILINEAR) 82 | else: 83 | target_h = int((256.0 / float(w)) * h) 84 | resized_img = img.resize((224, target_h), resample=PIL.Image.BILINEAR) 85 | 86 | # center crop to 224x224 87 | resized_w, resized_h = resized_img.size 88 | center_x = (resized_w - 224) / 2 89 | center_y = (resized_h - 224) / 2 90 | crop_img = resized_img.crop((center_x, center_y, center_x + 224, center_y + 224)) 91 | 92 | return crop_img 93 | 94 | 95 | def load_images(img_ids, img_root_path): 96 | u_root_path = os.path.join(img_root_path, "u") 97 | v_root_path = os.path.join(img_root_path, "v") 98 | u_list = sorted(os.listdir(u_root_path)) 99 | v_list = sorted(os.listdir(v_root_path)) 100 | 101 | raw_imgs = [] 102 | for img_id in img_ids: 103 | u_img_path = os.path.join(u_root_path, u_list[img_id]) 104 | v_img_path = os.path.join(v_root_path, v_list[img_id]) 105 | u_img = Image.open(u_img_path) 106 | v_img = Image.open(v_img_path) 107 | if CENTER_CROP: 108 | u_img = resize_and_center_crop(u_img) 109 | v_img = resize_and_center_crop(v_img) 110 | else: 111 | u_img = u_img.resize((224, 224), resample=PIL.Image.BILINEAR) 112 | v_img = v_img.resize((224, 224), resample=PIL.Image.BILINEAR) 113 | 114 | #stack of 115 | img = np.stack((np.array(u_img), np.array(v_img)), axis=-1) 116 | raw_imgs.append(img) 117 | 118 | images = preprocess_ds_ncdhw(raw_imgs) 119 | 120 | return images 121 | 122 | 123 | def get_prob(pred): 124 | 125 | pred = pred - pred.max() 126 | pred_exp = np.exp(pred) 127 | 128 | return pred_exp.max()/pred_exp.sum() 129 | 130 | 131 | if __name__ == "__main__": 132 | 133 | parser = argparse.ArgumentParser(description='Do AR 
inference using TRT') 134 | parser.add_argument('--input_images_folder', type=str, help='input images path', required=True) 135 | parser.add_argument('--trt_engine', type=str, help='trt engine file path', required=True) 136 | parser.add_argument('--center_crop', action="store_true", help='resize the short side of image to 224 and center crop 224x224') 137 | parser.add_argument('--input_2d', action="store_true", help='set if it is a 2d model') 138 | 139 | args = parser.parse_args() 140 | 141 | if args.center_crop: 142 | CENTER_CROP = True 143 | 144 | if args.input_2d: 145 | INPUT_2D = True 146 | 147 | batch_size = 1 148 | engine_file = args.trt_engine 149 | label_map = ["push", "fall_floor", "walk", "run", "ride_bike"] 150 | img_root = args.input_images_folder 151 | batch_cnt = 1 152 | 153 | total_cnt = 0 154 | ac_cnt = 0 155 | 156 | with load_tensorrt_engine(engine_file) as engine: 157 | with engine.create_execution_context() as context: 158 | if INPUT_2D: 159 | context.set_binding_shape(0, (1, 2*SEQ, 224, 224)) 160 | else: 161 | context.set_binding_shape(0, (1, 2, SEQ, 224, 224)) 162 | inputs, outputs, bindings, stream = allocate_buffers(engine, context) 163 | for class_name in os.listdir(img_root): 164 | class_path = os.path.join(img_root, class_name) 165 | cls_total= 0 166 | cls_ac = 0 167 | for video in os.listdir(class_path): 168 | total_cnt += 1 169 | cls_total +=1 170 | video_path = os.path.join(class_path, video) 171 | max_sample_cnt = len(os.listdir(os.path.join(video_path, "u"))) 172 | img_ids_list = [test_consecutive_sample(max_sample_cnt, SEQ)] 173 | for img_ids in img_ids_list: 174 | images = load_images(img_ids, video_path) 175 | for sample_id in range(batch_size): 176 | batch_images = images 177 | # Hard Coded For explicit_batch and the ONNX model's batch_size = 1 178 | batch_images = batch_images[np.newaxis, :, :, :] 179 | outputs_shape, outputs_data = do_inference(batch=batch_images, context=context, 180 | bindings=bindings, inputs=inputs, 181 | outputs=outputs, stream=stream) 182 | 183 | pred_data = np.squeeze(outputs_data['fc_pred']) 184 | label = label_map[np.argmax(pred_data)] 185 | if (label==class_name): 186 | ac_cnt += 1 187 | cls_ac +=1 188 | # prob = get_prob(pred_data) 189 | # print("{} : {} {}".format(img_ids, label, prob)) 190 | print("{} : {}".format(class_name, float(cls_ac)/float(cls_total))) 191 | 192 | print("Acc: {}".format(float(ac_cnt)/float(total_cnt))) -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/src/postprocess.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "postprocess.h" 24 | 25 | #define checkCudaErrors(status) \ 26 | { \ 27 | if (status != 0) \ 28 | { \ 29 | std::cout << "Cuda failure: " << cudaGetErrorString(status) \ 30 | << " at line " << __LINE__ \ 31 | << " in file " << __FILE__ \ 32 | << " error status: " << status \ 33 | << std::endl; \ 34 | abort(); \ 35 | } \ 36 | } 37 | 38 | const float ThresHold = 1e-8; 39 | 40 | inline float cross(const float2 p1, const float2 p2, const float2 p0) { 41 | return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); 42 | } 43 | 44 | inline int check_box2d(const Bndbox box, const float2 p) { 45 | const float MARGIN = 1e-2; 46 | float center_x = box.x; 47 | float center_y = box.y; 48 | float angle_cos = cos(-box.rt); 49 | float angle_sin = sin(-box.rt); 50 | float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); 51 | float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; 52 | 53 | return (fabs(rot_x) < box.l / 2 + MARGIN && fabs(rot_y) < box.w / 2 + MARGIN); 54 | } 55 | 56 | bool intersection(const float2 p1, const float2 p0, const float2 q1, const float2 q0, float2 &ans) { 57 | 58 | if (( std::min(p0.x, p1.x) <= std::max(q0.x, q1.x) && 59 | std::min(q0.x, q1.x) <= std::max(p0.x, p1.x) && 60 | std::min(p0.y, p1.y) <= std::max(q0.y, q1.y) && 61 | std::min(q0.y, q1.y) <= std::max(p0.y, p1.y) ) == 0) 62 | return false; 63 | 64 | 65 | float s1 = cross(q0, p1, p0); 66 | float s2 = cross(p1, q1, p0); 67 | float s3 = cross(p0, q1, q0); 68 | float s4 = cross(q1, p1, q0); 69 | 70 | if (!(s1 * s2 > 0 && s3 * s4 > 0)) 71 | return false; 72 | 73 | float s5 = cross(q1, p1, p0); 74 | if (fabs(s5 - s1) > ThresHold) { 75 | ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); 76 | ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); 77 | 78 | } else { 79 | float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; 80 | float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; 81 | float D = a0 * b1 - a1 * b0; 82 | 83 | ans.x = (b0 * c1 - b1 * c0) / D; 84 | ans.y = (a1 * c0 - a0 * c1) / D; 85 | } 86 | 87 | return true; 88 | } 89 | 90 | inline void rotate_around_center(const float2 ¢er, const float angle_cos, const float angle_sin, float2 &p) { 91 | float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x; 92 | float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; 93 | p = float2 {new_x, new_y}; 94 | } 95 | 96 | inline float box_overlap(const Bndbox &box_a, const Bndbox &box_b) { 97 | float a_angle = box_a.rt, b_angle = box_b.rt; 98 | float a_dx_half = box_a.l / 2, b_dx_half = box_b.l / 2, a_dy_half = box_a.w / 2, b_dy_half = box_b.w / 2; 99 | float a_x1 = box_a.x - a_dx_half, a_y1 = box_a.y - a_dy_half; 100 | float a_x2 = box_a.x + a_dx_half, a_y2 = box_a.y + a_dy_half; 101 | float b_x1 = box_b.x - b_dx_half, b_y1 = box_b.y - b_dy_half; 102 | float b_x2 = box_b.x + b_dx_half, b_y2 = box_b.y + b_dy_half; 103 | float2 box_a_corners[5]; 104 | float2 box_b_corners[5]; 105 | 106 | float2 center_a = float2 {box_a.x, box_a.y}; 107 | float2 center_b = float2 {box_b.x, box_b.y}; 108 | 109 | float2 cross_points[16]; 110 | float2 poly_center = {0, 0}; 111 | int cnt = 0; 112 | bool flag = false; 113 | 114 | box_a_corners[0] = float2 {a_x1, a_y1}; 115 | box_a_corners[1] = float2 {a_x2, a_y1}; 116 | box_a_corners[2] = float2 {a_x2, a_y2}; 117 | box_a_corners[3] = float2 {a_x1, a_y2}; 118 | 119 | 
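    // The remainder of box_overlap() computes the overlap area of two rotated boxes:
    //  1. build the four axis-aligned corners of box_b (box_a's corners were built above),
    //  2. rotate both corner sets around their box centers by the box yaw (rt),
    //  3. collect edge-edge intersection points, plus corners lying inside the other box,
    //     into cross_points[],
    //  4. sort those points counter-clockwise around their centroid (atan2 bubble sort),
    //  5. accumulate the polygon area with the shoelace formula (sum of cross products).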
box_b_corners[0] = float2 {b_x1, b_y1}; 120 | box_b_corners[1] = float2 {b_x2, b_y1}; 121 | box_b_corners[2] = float2 {b_x2, b_y2}; 122 | box_b_corners[3] = float2 {b_x1, b_y2}; 123 | 124 | float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); 125 | float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); 126 | 127 | for (int k = 0; k < 4; k++) { 128 | rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); 129 | rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); 130 | } 131 | 132 | box_a_corners[4] = box_a_corners[0]; 133 | box_b_corners[4] = box_b_corners[0]; 134 | 135 | for (int i = 0; i < 4; i++) { 136 | for (int j = 0; j < 4; j++) { 137 | flag = intersection(box_a_corners[i + 1], box_a_corners[i], 138 | box_b_corners[j + 1], box_b_corners[j], 139 | cross_points[cnt]); 140 | if (flag) { 141 | poly_center = {poly_center.x + cross_points[cnt].x, poly_center.y + cross_points[cnt].y}; 142 | cnt++; 143 | } 144 | } 145 | } 146 | 147 | for (int k = 0; k < 4; k++) { 148 | if (check_box2d(box_a, box_b_corners[k])) { 149 | poly_center = {poly_center.x + box_b_corners[k].x, poly_center.y + box_b_corners[k].y}; 150 | cross_points[cnt] = box_b_corners[k]; 151 | cnt++; 152 | } 153 | if (check_box2d(box_b, box_a_corners[k])) { 154 | poly_center = {poly_center.x + box_a_corners[k].x, poly_center.y + box_a_corners[k].y}; 155 | cross_points[cnt] = box_a_corners[k]; 156 | cnt++; 157 | } 158 | } 159 | 160 | poly_center.x /= cnt; 161 | poly_center.y /= cnt; 162 | 163 | float2 temp; 164 | for (int j = 0; j < cnt - 1; j++) { 165 | for (int i = 0; i < cnt - j - 1; i++) { 166 | if (atan2(cross_points[i].y - poly_center.y, cross_points[i].x - poly_center.x) > 167 | atan2(cross_points[i+1].y - poly_center.y, cross_points[i+1].x - poly_center.x) 168 | ) { 169 | temp = cross_points[i]; 170 | cross_points[i] = cross_points[i + 1]; 171 | cross_points[i + 1] = temp; 172 | } 173 | } 174 | } 175 | 176 | float area = 0; 177 | for (int k = 0; k < cnt - 1; k++) { 178 | float2 a = {cross_points[k].x - cross_points[0].x, 179 | cross_points[k].y - cross_points[0].y}; 180 | float2 b = {cross_points[k + 1].x - cross_points[0].x, 181 | cross_points[k + 1].y - cross_points[0].y}; 182 | area += (a.x * b.y - a.y * b.x); 183 | } 184 | return fabs(area) / 2.0; 185 | } 186 | 187 | int nms_cpu( 188 | std::vector bndboxes, 189 | const float nms_thresh, 190 | std::vector &nms_pred, 191 | const int pre_nms_top_n) 192 | { 193 | std::sort(bndboxes.begin(), bndboxes.end(), 194 | [](Bndbox boxes1, Bndbox boxes2) { return boxes1.score > boxes2.score; }); 195 | std::vector suppressed(std::min(int(bndboxes.size()), pre_nms_top_n), 0); 196 | for (size_t i = 0; i < std::min(int(bndboxes.size()), pre_nms_top_n); i++) { 197 | if (suppressed[i] == 1) { 198 | continue; 199 | } 200 | nms_pred.emplace_back(bndboxes[i]); 201 | for (size_t j = i + 1; j < std::min(int(bndboxes.size()), pre_nms_top_n); j++) { 202 | if (suppressed[j] == 1) { 203 | continue; 204 | } 205 | float sa = bndboxes[i].l * bndboxes[i].w; 206 | float sb = bndboxes[j].l * bndboxes[j].w; 207 | float s_overlap = box_overlap(bndboxes[i], bndboxes[j]); 208 | float iou = s_overlap / fmaxf(sa + sb - s_overlap, ThresHold); 209 | 210 | if (iou >= nms_thresh) { 211 | suppressed[j] = 1; 212 | } 213 | } 214 | } 215 | return 0; 216 | } 217 | -------------------------------------------------------------------------------- /tao_object_dection/yolov4/specs/yolov4_416_coco14.txt: 
-------------------------------------------------------------------------------- 1 | random_seed: 42 2 | yolov4_config { 3 | big_anchor_shape: "[(87.73, 65.44),(115.02, 177.14),(288.61, 296.34)]" 4 | mid_anchor_shape: "[(20.78, 55.42),(41.95, 33.66),(43.95, 111.22)]" 5 | small_anchor_shape: "[(5.84, 9.60),(10.24, 27.14),(21.27, 15.56)]" 6 | box_matching_iou: 0.25 7 | matching_neutral_box_iou: 0.5 8 | arch: "cspdarknet" 9 | nlayers: 53 10 | arch_conv_blocks: 2 11 | loss_loc_weight: 1.0 12 | loss_neg_obj_weights: 1.0 13 | loss_class_weights: 1.0 14 | label_smoothing: 0.0 15 | big_grid_xy_extend: 0.05 16 | mid_grid_xy_extend: 0.1 17 | small_grid_xy_extend: 0.2 18 | freeze_bn: false 19 | } 20 | 21 | training_config { 22 | batch_size_per_gpu: 8 23 | num_epochs: 300 24 | enable_qat: false 25 | checkpoint_interval: 1 26 | learning_rate { 27 | soft_start_cosine_annealing_schedule { 28 | min_learning_rate: 1e-5 29 | max_learning_rate: 0.000125 #0.00032625 #0.000435 30 | soft_start: 0.001 31 | } 32 | } 33 | regularizer { 34 | type: L2 35 | weight: 3e-5 36 | } 37 | optimizer { 38 | adam { 39 | epsilon: 1e-7 40 | beta1: 0.9 41 | beta2: 0.999 42 | amsgrad: false 43 | } 44 | } 45 | 46 | n_workers: 8 47 | use_multiprocessing: false 48 | pretrain_model_path: "/workspace_tz/tao_yolov4/cspdarknet_199.tlt" 49 | } 50 | eval_config { 51 | average_precision_mode: INTEGRATE 52 | batch_size: 8 53 | matching_iou_threshold: 0.5 54 | } 55 | nms_config { 56 | confidence_threshold: 0.001 57 | clustering_iou_threshold: 0.6 58 | top_k: 300 59 | force_on_cpu: True 60 | } 61 | augmentation_config { 62 | hue: 0.1 63 | saturation: 1.5 64 | exposure:1.5 65 | vertical_flip:0 66 | horizontal_flip: 0.5 67 | jitter: 0.3 68 | output_width: 416 69 | output_height: 416 70 | randomize_input_shape_period: 10 71 | output_channel: 3 72 | mosaic_prob: 0.5 73 | mosaic_min_ratio:0.2 74 | image_mean { 75 | key: 'b' 76 | value: 103.9 77 | } 78 | image_mean { 79 | key: 'g' 80 | value: 116.8 81 | } 82 | image_mean { 83 | key: 'r' 84 | value: 123.7 85 | } 86 | } 87 | dataset_config { 88 | data_sources: { 89 | label_directory_path: "/raid/KITTI/trainval2014" 90 | image_directory_path: "/raid/images/trainval2014" 91 | } 92 | target_class_mapping { 93 | key: "apple" 94 | value: "apple" 95 | } 96 | target_class_mapping { 97 | key: "book" 98 | value: "book" 99 | } 100 | target_class_mapping { 101 | key: "handbag" 102 | value: "handbag" 103 | } 104 | target_class_mapping { 105 | key: "car" 106 | value: "car" 107 | } 108 | target_class_mapping { 109 | key: "pottedplant" 110 | value: "pottedplant" 111 | } 112 | target_class_mapping { 113 | key: "backpack" 114 | value: "backpack" 115 | } 116 | target_class_mapping { 117 | key: "clock" 118 | value: "clock" 119 | } 120 | target_class_mapping { 121 | key: "truck" 122 | value: "truck" 123 | } 124 | target_class_mapping { 125 | key: "knife" 126 | value: "knife" 127 | } 128 | target_class_mapping { 129 | key: "cup" 130 | value: "cup" 131 | } 132 | target_class_mapping { 133 | key: "snowboard" 134 | value: "snowboard" 135 | } 136 | target_class_mapping { 137 | key: "suitcase" 138 | value: "suitcase" 139 | } 140 | target_class_mapping { 141 | key: "umbrella" 142 | value: "umbrella" 143 | } 144 | target_class_mapping { 145 | key: "bowl" 146 | value: "bowl" 147 | } 148 | target_class_mapping { 149 | key: "carrot" 150 | value: "carrot" 151 | } 152 | target_class_mapping { 153 | key: "person" 154 | value: "person" 155 | } 156 | target_class_mapping { 157 | key: "fork" 158 | value: "fork" 159 | } 160 | 
target_class_mapping { 161 | key: "train" 162 | value: "train" 163 | } 164 | target_class_mapping { 165 | key: "pizza" 166 | value: "pizza" 167 | } 168 | target_class_mapping { 169 | key: "couch" 170 | value: "couch" 171 | } 172 | target_class_mapping { 173 | key: "bus" 174 | value: "bus" 175 | } 176 | target_class_mapping { 177 | key: "skis" 178 | value: "skis" 179 | } 180 | target_class_mapping { 181 | key: "keyboard" 182 | value: "keyboard" 183 | } 184 | target_class_mapping { 185 | key: "firehydrant" 186 | value: "firehydrant" 187 | } 188 | target_class_mapping { 189 | key: "tennisracket" 190 | value: "tennisracket" 191 | } 192 | target_class_mapping { 193 | key: "sandwich" 194 | value: "sandwich" 195 | } 196 | target_class_mapping { 197 | key: "toothbrush" 198 | value: "toothbrush" 199 | } 200 | target_class_mapping { 201 | key: "motorcycle" 202 | value: "motorcycle" 203 | } 204 | target_class_mapping { 205 | key: "remote" 206 | value: "remote" 207 | } 208 | target_class_mapping { 209 | key: "frisbee" 210 | value: "frisbee" 211 | } 212 | target_class_mapping { 213 | key: "mouse" 214 | value: "mouse" 215 | } 216 | target_class_mapping { 217 | key: "trafficlight" 218 | value: "trafficlight" 219 | } 220 | target_class_mapping { 221 | key: "oven" 222 | value: "oven" 223 | } 224 | target_class_mapping { 225 | key: "scissors" 226 | value: "scissors" 227 | } 228 | target_class_mapping { 229 | key: "airplane" 230 | value: "airplane" 231 | } 232 | target_class_mapping { 233 | key: "teddybear" 234 | value: "teddybear" 235 | } 236 | target_class_mapping { 237 | key: "refrigerator" 238 | value: "refrigerator" 239 | } 240 | target_class_mapping { 241 | key: "stopsign" 242 | value: "stopsign" 243 | } 244 | target_class_mapping { 245 | key: "bed" 246 | value: "bed" 247 | } 248 | target_class_mapping { 249 | key: "orange" 250 | value: "orange" 251 | } 252 | target_class_mapping { 253 | key: "bottle" 254 | value: "bottle" 255 | } 256 | target_class_mapping { 257 | key: "sink" 258 | value: "sink" 259 | } 260 | target_class_mapping { 261 | key: "chair" 262 | value: "chair" 263 | } 264 | target_class_mapping { 265 | key: "broccoli" 266 | value: "broccoli" 267 | } 268 | target_class_mapping { 269 | key: "horse" 270 | value: "horse" 271 | } 272 | target_class_mapping { 273 | key: "elephant" 274 | value: "elephant" 275 | } 276 | target_class_mapping { 277 | key: "tie" 278 | value: "tie" 279 | } 280 | target_class_mapping { 281 | key: "banana" 282 | value: "banana" 283 | } 284 | target_class_mapping { 285 | key: "donut" 286 | value: "donut" 287 | } 288 | target_class_mapping { 289 | key: "baseballglove" 290 | value: "baseballglove" 291 | } 292 | target_class_mapping { 293 | key: "surfboard" 294 | value: "surfboard" 295 | } 296 | target_class_mapping { 297 | key: "hotdog" 298 | value: "hotdog" 299 | } 300 | target_class_mapping { 301 | key: "skateboard" 302 | value: "skateboard" 303 | } 304 | target_class_mapping { 305 | key: "zebra" 306 | value: "zebra" 307 | } 308 | target_class_mapping { 309 | key: "boat" 310 | value: "boat" 311 | } 312 | target_class_mapping { 313 | key: "vase" 314 | value: "vase" 315 | } 316 | target_class_mapping { 317 | key: "baseballbat" 318 | value: "baseballbat" 319 | } 320 | target_class_mapping { 321 | key: "hairdrier" 322 | value: "hairdrier" 323 | } 324 | target_class_mapping { 325 | key: "cake" 326 | value: "cake" 327 | } 328 | target_class_mapping { 329 | key: "diningtable" 330 | value: "diningtable" 331 | } 332 | target_class_mapping { 333 | key: "bicycle" 334 | value: 
"bicycle" 335 | } 336 | target_class_mapping { 337 | key: "laptop" 338 | value: "laptop" 339 | } 340 | target_class_mapping { 341 | key: "wineglass" 342 | value: "wineglass" 343 | } 344 | target_class_mapping { 345 | key: "bear" 346 | value: "bear" 347 | } 348 | target_class_mapping { 349 | key: "parkingmeter" 350 | value: "parkingmeter" 351 | } 352 | target_class_mapping { 353 | key: "tv" 354 | value: "tv" 355 | } 356 | target_class_mapping { 357 | key: "cat" 358 | value: "cat" 359 | } 360 | target_class_mapping { 361 | key: "bird" 362 | value: "bird" 363 | } 364 | target_class_mapping { 365 | key: "toilet" 366 | value: "toilet" 367 | } 368 | target_class_mapping { 369 | key: "sportsball" 370 | value: "sportsball" 371 | } 372 | target_class_mapping { 373 | key: "sheep" 374 | value: "sheep" 375 | } 376 | target_class_mapping { 377 | key: "microwave" 378 | value: "microwave" 379 | } 380 | target_class_mapping { 381 | key: "cow" 382 | value: "cow" 383 | } 384 | target_class_mapping { 385 | key: "bench" 386 | value: "bench" 387 | } 388 | target_class_mapping { 389 | key: "giraffe" 390 | value: "giraffe" 391 | } 392 | target_class_mapping { 393 | key: "spoon" 394 | value: "spoon" 395 | } 396 | target_class_mapping { 397 | key: "dog" 398 | value: "dog" 399 | } 400 | target_class_mapping { 401 | key: "toaster" 402 | value: "toaster" 403 | } 404 | target_class_mapping { 405 | key: "cellphone" 406 | value: "cellphone" 407 | } 408 | target_class_mapping { 409 | key: "kite" 410 | value: "kite" 411 | } 412 | image_extension: "jpg" 413 | validation_data_sources: { 414 | label_directory_path: "/raid/KITTI/val_5k" 415 | image_directory_path: "/raid/images/val_5k" 416 | } 417 | } 418 | -------------------------------------------------------------------------------- /tao_pointpillars/tensorrt_sample/test/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "cuda_runtime.h" 25 | #include "./pointpillar.h" 26 | 27 | #include 28 | 29 | #define checkCudaErrors(status) \ 30 | { \ 31 | if (status != 0) \ 32 | { \ 33 | std::cout << "Cuda failure: " << cudaGetErrorString(status) \ 34 | << " at line " << __LINE__ \ 35 | << " in file " << __FILE__ \ 36 | << " error status: " << status \ 37 | << std::endl; \ 38 | abort(); \ 39 | } \ 40 | } 41 | 42 | int loadData(const char *file, void **data, unsigned int *length) 43 | { 44 | std::fstream dataFile(file, std::ifstream::in); 45 | 46 | if (!dataFile.is_open()) 47 | { 48 | std::cout << "Can't open files: "<< file<& ret, // NOLINT(runtime/references) 78 | char del = ',') { 79 | int idx = 0; 80 | auto p = std::string(s + idx).find(std::string(1, del)); 81 | while (std::string::npos != p) { 82 | auto s_tmp = std::string(s + idx).substr(0, p); 83 | ret.push_back(s_tmp); 84 | idx += (p + 1); 85 | p = std::string(s + idx).find(std::string(1, del)); 86 | } 87 | if (s[idx] != 0) { 88 | ret.push_back(std::string(s + idx)); 89 | } 90 | } 91 | 92 | void parse_args( 93 | int argc, char**argv, 94 | std::vector& class_names, 95 | float& nms_iou_thresh, 96 | int& pre_nms_top_n, 97 | bool& do_profile, 98 | std::string& model_path, 99 | std::string& engine_path, 100 | std::string& data_path, 101 | std::string& data_type, 102 | std::string& output_path 103 | ) { 104 | int c; 105 | while ((c = getopt(argc, argv, "c:n:t:m:l:d:e:o:ph")) != -1) { 106 | switch (c) { 107 | case 't': 108 | { 109 | nms_iou_thresh = atof(optarg); 110 | break; 111 | } 112 | case 'n': 113 | { 114 | pre_nms_top_n = atoi(optarg); 115 | break; 116 | } 117 | case 'c': 118 | { 119 | split_str(optarg, class_names); 120 | break; 121 | } 122 | case 'm': 123 | { 124 | model_path = std::string(optarg); 125 | break; 126 | } 127 | case 'e': 128 | { 129 | engine_path = std::string(optarg); 130 | break; 131 | } 132 | case 'l': 133 | { 134 | data_path = std::string(optarg); 135 | break; 136 | } 137 | case 'o': 138 | { 139 | output_path = std::string(optarg); 140 | break; 141 | } 142 | case 'd': 143 | { 144 | data_type = std::string(optarg); 145 | break; 146 | } 147 | case 'p': 148 | { 149 | do_profile = true; 150 | break; 151 | } 152 | case 'h': 153 | { 154 | std::cout << "Usage: " << std::endl; 155 | std::cout << argv[0] << " -t " << 156 | " -c -n " << 157 | " -l -m " << 158 | " -e -d -o -p -h" << 159 | std::endl; 160 | exit(1); 161 | } 162 | default: 163 | { 164 | std::cerr << "Unrecognized argument" << std::endl; 165 | abort(); 166 | } 167 | } 168 | } 169 | } 170 | 171 | std::vector class_names; 172 | float nms_iou_thresh; 173 | int pre_nms_top_n; 174 | bool do_profile{false}; 175 | std::string model_path; 176 | std::string engine_path; 177 | std::string data_path; 178 | std::string data_type{"fp32"}; 179 | std::string output_path; 180 | 181 | 182 | void SaveBoxPred(std::vector boxes, std::string file_name) 183 | { 184 | std::ofstream ofs; 185 | ofs.open(file_name, std::ios::out); 186 | if (ofs.is_open()) { 187 | for (const auto box : boxes) { 188 | ofs << box.x << " "; 189 | ofs << box.y << " "; 190 | ofs << box.z << " "; 191 | ofs << box.w << " "; 192 | ofs << box.l << " "; 193 | ofs << box.h << " "; 194 | ofs << box.rt << " "; 195 | ofs << box.id << " "; 196 | ofs << box.score << " "; 197 | ofs << "\n"; 198 | } 199 | } 200 | else { 201 | std::cerr << "Output file cannot be opened!" 
<< std::endl; 202 | } 203 | ofs.close(); 204 | std::cout << "Saved prediction in: " << file_name << std::endl; 205 | return; 206 | }; 207 | 208 | 209 | int main(int argc, char **argv) 210 | { 211 | parse_args( 212 | argc, argv, 213 | class_names, 214 | nms_iou_thresh, 215 | pre_nms_top_n, 216 | do_profile, 217 | model_path, 218 | engine_path, 219 | data_path, 220 | data_type, 221 | output_path 222 | ); 223 | assert(data_type == "fp32" || data_type == "fp16"); 224 | std::cout << "Loading Data: " << data_path << std::endl; 225 | cudaEvent_t start, stop; 226 | float elapsedTime = 0.0f; 227 | cudaStream_t stream = NULL; 228 | 229 | checkCudaErrors(cudaEventCreate(&start)); 230 | checkCudaErrors(cudaEventCreate(&stop)); 231 | checkCudaErrors(cudaStreamCreate(&stream)); 232 | 233 | std::vector nms_pred; 234 | nms_pred.reserve(100); 235 | 236 | PointPillar pointpillar(model_path, engine_path, stream, data_type); 237 | 238 | 239 | std::string dataFile = data_path; 240 | //load points cloud 241 | unsigned int length = 0; 242 | void *data = NULL; 243 | std::shared_ptr buffer((char *)data, std::default_delete()); 244 | loadData(dataFile.data(), &data, &length); 245 | buffer.reset((char *)data); 246 | 247 | float* points = (float*)buffer.get(); 248 | unsigned int num_point_values = pointpillar.getPointSize(); 249 | unsigned int points_size = length/sizeof(float)/num_point_values; 250 | 251 | float *points_data = nullptr; 252 | unsigned int *points_num = nullptr; 253 | unsigned int points_data_size = points_size * num_point_values * sizeof(float); 254 | 255 | checkCudaErrors(cudaMallocManaged((void **)&points_data, points_data_size)); 256 | checkCudaErrors(cudaMallocManaged((void **)&points_num, sizeof(unsigned int))); 257 | checkCudaErrors(cudaMemcpy(points_data, points, points_data_size, cudaMemcpyDefault)); 258 | checkCudaErrors(cudaMemcpy(points_num, &points_size, sizeof(unsigned int), cudaMemcpyDefault)); 259 | checkCudaErrors(cudaDeviceSynchronize()); 260 | 261 | cudaEventRecord(start, stream); 262 | 263 | pointpillar.doinfer( 264 | points_data, points_num, nms_pred, 265 | nms_iou_thresh, 266 | pre_nms_top_n, 267 | class_names, 268 | do_profile 269 | ); 270 | cudaEventRecord(stop, stream); 271 | cudaEventSynchronize(stop); 272 | cudaEventElapsedTime(&elapsedTime, start, stop); 273 | std::cout<<"TIME: pointpillar: "<< elapsedTime <<" ms." 
<>>>>>>>>>>" < 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "cuda_runtime.h" 25 | #include "NvInfer.h" 26 | #include "NvOnnxConfig.h" 27 | #include "NvOnnxParser.h" 28 | #include "NvInferRuntime.h" 29 | #include "NvInferPlugin.h" 30 | #include "pointpillar.h" 31 | 32 | #define checkCudaErrors(status) \ 33 | { \ 34 | if (status != 0) \ 35 | { \ 36 | std::cout << "Cuda failure: " << cudaGetErrorString(status) \ 37 | << " at line " << __LINE__ \ 38 | << " in file " << __FILE__ \ 39 | << " error status: " << status \ 40 | << std::endl; \ 41 | abort(); \ 42 | } \ 43 | } 44 | 45 | 46 | struct SimpleProfiler : public nvinfer1::IProfiler 47 | { 48 | struct Record 49 | { 50 | float time{0}; 51 | int count{0}; 52 | }; 53 | 54 | virtual void reportLayerTime(const char* layerName, float ms) noexcept 55 | { 56 | mProfile[layerName].count++; 57 | mProfile[layerName].time += ms; 58 | if (std::find(mLayerNames.begin(), mLayerNames.end(), layerName) == mLayerNames.end()) 59 | { 60 | mLayerNames.push_back(layerName); 61 | } 62 | } 63 | 64 | SimpleProfiler(const char* name, const std::vector& srcProfilers = std::vector()) 65 | : mName(name) 66 | { 67 | for (const auto& srcProfiler : srcProfilers) 68 | { 69 | for (const auto& rec : srcProfiler.mProfile) 70 | { 71 | auto it = mProfile.find(rec.first); 72 | if (it == mProfile.end()) 73 | { 74 | mProfile.insert(rec); 75 | } 76 | else 77 | { 78 | it->second.time += rec.second.time; 79 | it->second.count += rec.second.count; 80 | } 81 | } 82 | } 83 | } 84 | 85 | friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value) 86 | { 87 | out << "========== " << value.mName << " profile ==========" << std::endl; 88 | float totalTime = 0; 89 | std::string layerNameStr = "TensorRT layer name"; 90 | int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); 91 | for (const auto& elem : value.mProfile) 92 | { 93 | totalTime += elem.second.time; 94 | maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); 95 | } 96 | 97 | auto old_settings = out.flags(); 98 | auto old_precision = out.precision(); 99 | // Output header 100 | { 101 | out << std::setw(maxLayerNameLength) << layerNameStr << " "; 102 | out << std::setw(12) << "Runtime, " 103 | << "%" 104 | << " "; 105 | out << std::setw(12) << "Invocations" 106 | << " "; 107 | out << std::setw(12) << "Runtime, ms" << std::endl; 108 | } 109 | for (size_t i = 0; i < value.mLayerNames.size(); i++) 110 | { 111 | const std::string layerName = value.mLayerNames[i]; 112 | auto elem = value.mProfile.at(layerName); 113 | out << std::setw(maxLayerNameLength) << layerName << " "; 114 | out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" 115 | << " "; 116 | out << std::setw(12) << elem.count << " "; 117 | out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; 118 | } 119 | out.flags(old_settings); 120 | out.precision(old_precision); 121 | out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; 122 | 123 | return out; 124 | } 125 | 126 | private: 127 | std::string mName; 128 | std::vector mLayerNames; 129 | std::map mProfile; 130 | }; 131 | 132 | 133 | TRT::~TRT(void) 134 | { 135 | context->destroy(); 136 | engine->destroy(); 137 | checkCudaErrors(cudaEventDestroy(start)); 138 | checkCudaErrors(cudaEventDestroy(stop)); 139 | } 140 | 141 | TRT::TRT( 142 | std::string modelFile, 143 | std::string modelCache, 144 | 
cudaStream_t stream, 145 | const std::string& data_type 146 | ):stream_(stream) 147 | { 148 | initLibNvInferPlugins(&gLogger_, ""); 149 | std::fstream trtCache(modelCache, std::ifstream::in); 150 | checkCudaErrors(cudaEventCreate(&start)); 151 | checkCudaErrors(cudaEventCreate(&stop)); 152 | if (!trtCache.is_open()) 153 | { 154 | std::cout << "Loading Model: " << modelFile << std::endl; 155 | std::cout << "Building TRT engine from the model."<(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 161 | auto network = (builder->createNetworkV2(explicitBatch)); 162 | 163 | // define onnxparser 164 | auto parser = (nvonnxparser::createParser(*network, gLogger_)); 165 | if (!parser->parseFromFile(modelFile.data(), static_cast(nvinfer1::ILogger::Severity::kWARNING))) 166 | { 167 | std::cerr << ": Failed to parse onnx model file, please check the onnx version and trt support op!" 168 | << std::endl; 169 | exit(-1); 170 | } 171 | // dynamic shape 172 | nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile(); 173 | // define config 174 | auto networkConfig = builder->createBuilderConfig(); 175 | if(data_type == "fp16") { 176 | networkConfig->setFlag(nvinfer1::BuilderFlag::kFP16); 177 | std::cout << "Enabled FP16 data type!" << std::endl; 178 | } 179 | nvinfer1::Dims dims{}; 180 | dims.nbDims = 3; 181 | dims.d[0] = 1; 182 | auto input0_dims = network->getInput(0)->getDimensions(); 183 | dims.d[1] = input0_dims.d[1]; 184 | dims.d[2] = 4; 185 | profile->setDimensions("points", nvinfer1::OptProfileSelector::kMIN, dims); 186 | profile->setDimensions("points", nvinfer1::OptProfileSelector::kOPT, dims); 187 | profile->setDimensions("points", nvinfer1::OptProfileSelector::kMAX, dims); 188 | dims.nbDims = 1; 189 | dims.d[0] = 1; 190 | profile->setDimensions("num_points", nvinfer1::OptProfileSelector::kMIN, dims); 191 | profile->setDimensions("num_points", nvinfer1::OptProfileSelector::kOPT, dims); 192 | profile->setDimensions("num_points", nvinfer1::OptProfileSelector::kMAX, dims); 193 | networkConfig->addOptimizationProfile(profile); 194 | // set max workspace 195 | networkConfig->setMaxWorkspaceSize(size_t(1) << 30); 196 | 197 | engine = (builder->buildEngineWithConfig(*network, *networkConfig)); 198 | 199 | if (engine == nullptr) 200 | { 201 | std::cerr << ": engine init null!" 
<< std::endl; 202 | exit(-1); 203 | } 204 | 205 | // serialize the engine, then close everything down 206 | auto trtModelStream = (engine->serialize()); 207 | std::string modelCacheSave = modelFile + ".cache"; 208 | std::fstream trtOut(modelCacheSave, std::ifstream::out); 209 | if (!trtOut.is_open()) 210 | { 211 | std::cout << "Can't store trt cache.\n"; 212 | exit(-1); 213 | } 214 | trtOut.write((char*)trtModelStream->data(), trtModelStream->size()); 215 | trtOut.close(); 216 | trtModelStream->destroy(); 217 | 218 | networkConfig->destroy(); 219 | parser->destroy(); 220 | network->destroy(); 221 | builder->destroy(); 222 | } else { 223 | std::cout << "Loading existing TRT Engine: " 224 | << modelCache 225 | << std::endl; 226 | char *data; 227 | unsigned int length; 228 | // get length of file: 229 | trtCache.seekg(0, trtCache.end); 230 | length = trtCache.tellg(); 231 | trtCache.seekg(0, trtCache.beg); 232 | data = (char *)malloc(length); 233 | if (data == NULL ) { 234 | std::cout << "Can't malloc data.\n"; 235 | exit(-1); 236 | } 237 | trtCache.read(data, length); 238 | // create context 239 | auto runtime = nvinfer1::createInferRuntime(gLogger_); 240 | if (runtime == nullptr) { 241 | std::cerr << ": runtime null!" << std::endl; 242 | exit(-1); 243 | } 244 | engine = (runtime->deserializeCudaEngine(data, length, 0)); 245 | if (engine == nullptr) { 246 | std::cerr << ": engine null!" << std::endl; 247 | exit(-1); 248 | } 249 | free(data); 250 | trtCache.close(); 251 | } 252 | 253 | context = engine->createExecutionContext(); 254 | 255 | } 256 | 257 | int TRT::doinfer(void**buffers, bool do_profile) 258 | { 259 | int status; 260 | SimpleProfiler profiler("perf"); 261 | if(do_profile) 262 | context->setProfiler(&profiler); 263 | status = context->enqueueV2(buffers, stream_, &start); 264 | if(do_profile) 265 | std::cout << profiler; 266 | if (!status) 267 | { 268 | return false; 269 | } 270 | return true; 271 | } 272 | 273 | nvinfer1::Dims TRT::get_binding_shape(int index) 274 | { 275 | return context->getBindingDimensions(index); 276 | } 277 | 278 | int TRT::getPointSize() { 279 | return context->getBindingDimensions(0).d[2]; 280 | } 281 | 282 | PointPillar::PointPillar( 283 | std::string modelFile, 284 | std::string engineFile, 285 | cudaStream_t stream, 286 | const std::string& data_type 287 | ):stream_(stream) 288 | { 289 | 290 | checkCudaErrors(cudaEventCreate(&start)); 291 | checkCudaErrors(cudaEventCreate(&stop)); 292 | 293 | trt_.reset(new TRT(modelFile, engineFile, stream_, data_type)); 294 | 295 | //output of TRT 296 | box_size = (trt_->get_binding_shape(2).d[1]) * 9 * sizeof(float); 297 | checkCudaErrors(cudaMallocManaged((void **)&box_output, box_size)); 298 | checkCudaErrors(cudaMallocManaged((void **)&box_num, sizeof(int))); 299 | res.reserve(100); 300 | } 301 | 302 | PointPillar::~PointPillar(void) 303 | { 304 | trt_.reset(); 305 | 306 | checkCudaErrors(cudaFree(box_output)); 307 | checkCudaErrors(cudaFree(box_num)); 308 | checkCudaErrors(cudaEventDestroy(start)); 309 | checkCudaErrors(cudaEventDestroy(stop)); 310 | } 311 | 312 | int PointPillar::getPointSize() { 313 | return trt_->getPointSize(); 314 | } 315 | 316 | int PointPillar::doinfer( 317 | void*points_data, 318 | unsigned int* points_size, 319 | std::vector &nms_pred, 320 | float nms_iou_thresh, 321 | int pre_nms_top_n, 322 | std::vector& class_names, 323 | bool do_profile 324 | ) 325 | { 326 | #if PERFORMANCE_LOG 327 | float doinferTime = 0.0f; 328 | cudaEventRecord(start, stream_); 329 | #endif 330 | void *buffers[] 
= {points_data, points_size, box_output, box_num}; 331 | 332 | trt_->doinfer(buffers, do_profile); 333 | 334 | #if PERFORMANCE_LOG 335 | checkCudaErrors(cudaEventRecord(stop, stream_)); 336 | checkCudaErrors(cudaEventSynchronize(stop)); 337 | checkCudaErrors(cudaEventElapsedTime(&doinferTime, start, stop)); 338 | std::cout<<"TIME: doinfer: "<< doinferTime <<" ms." < 30 | #if CV_MAJOR_VERSION >= 3 31 | # include 32 | # include 33 | #else 34 | # include 35 | #endif 36 | 37 | #include 38 | #include 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | 60 | #define TAG_STRING "PIEH" // use this when WRITING the file 61 | 62 | #define CHECK_STATUS(STMT) \ 63 | do \ 64 | { \ 65 | VPIStatus status = (STMT); \ 66 | if (status != VPI_SUCCESS) \ 67 | { \ 68 | char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH]; \ 69 | vpiGetLastStatusMessage(buffer, sizeof(buffer)); \ 70 | std::ostringstream ss; \ 71 | ss << vpiStatusGetName(status) << ": " << buffer; \ 72 | throw std::runtime_error(ss.str()); \ 73 | } \ 74 | } while (0); 75 | 76 | static std::string generateRegexPattern(const std::string& imageNamePattern) 77 | { 78 | std::string regex_pat; 79 | std::string image; 80 | std::string temp; 81 | 82 | for (auto it = imageNamePattern.cbegin(); it != imageNamePattern.cend(); ++it) 83 | { 84 | if (*it == '*') 85 | { 86 | image.append(".*"); 87 | } 88 | else if (*it == '?') 89 | { 90 | image.append("."); 91 | } 92 | else 93 | { 94 | image.append(1, *it); 95 | } 96 | } 97 | 98 | size_t pos = image.find_first_of("%"); 99 | if (pos != std::string::npos) 100 | { 101 | if (pos > 0) 102 | { 103 | regex_pat.append(image.substr(0, pos)); 104 | } 105 | temp = image.substr(pos + 1); 106 | pos = temp.find_first_of("d"); 107 | if (pos != std::string::npos) 108 | { 109 | if (pos > 0) 110 | { 111 | auto nd = atoi(temp.substr(0, pos).c_str()); 112 | std::ostringstream ss; 113 | ss << "([0-9]){" << nd << ",}"; 114 | regex_pat.append(ss.str()); 115 | } 116 | else 117 | { 118 | regex_pat.append("([0 - 9]){1,}"); 119 | } 120 | regex_pat.append(temp.substr(pos + 1)); 121 | } 122 | } 123 | else 124 | { 125 | regex_pat.append(image); 126 | } 127 | return regex_pat; 128 | } 129 | 130 | static std::vector> ReadDirectory(const std::string& path) 131 | { 132 | std::vector> files; 133 | DIR* d; 134 | struct dirent* dir; 135 | d = opendir(path.c_str()); 136 | if (d) 137 | { 138 | while ((dir = readdir(d)) != NULL) 139 | { 140 | const char* name = dir->d_name; 141 | if ((name[0] == 0) || 142 | (name[0] == '.' && name[1] == 0) || 143 | (name[0] == '.' && name[1] == '.' 
&& name[2] == 0)) 144 | continue; 145 | 146 | struct stat buf; 147 | if ((stat(name, &buf) == 0) && 148 | S_ISDIR(buf.st_mode)) 149 | continue; 150 | 151 | files.push_back(std::make_pair(path + "/" + std::string(name), std::string(name))); 152 | } 153 | 154 | closedir(d); 155 | } 156 | 157 | return files; 158 | } 159 | 160 | static void glob(const std::string& image, std::vector& result) 161 | { 162 | const char dir_separators[] = "/\\"; 163 | std::string wildchart; 164 | std::string path; 165 | size_t pos = image.find_last_of(dir_separators); 166 | if (pos == std::string::npos) 167 | { 168 | wildchart = image; 169 | path = "."; 170 | } 171 | else 172 | { 173 | path = image.substr(0, pos); 174 | wildchart = image.substr(pos + 1); 175 | } 176 | std::string regex_str = generateRegexPattern(wildchart); 177 | std::regex regex_pat{ regex_str }; 178 | #ifndef NDEBUG 179 | std::cout << "Input file directory path : " << path << std::endl; 180 | std::cout << "Input file pattern : " << wildchart << std::endl; 181 | #endif 182 | std::vector> fileNames = ReadDirectory(path); 183 | for (const auto & p : fileNames) 184 | { 185 | if (!p.first.empty() && !p.second.empty()) 186 | { 187 | auto fileName = p.second; 188 | if (!wildchart.empty()) 189 | { 190 | if (regex_match(fileName, regex_pat)) 191 | { 192 | result.push_back(p.first); 193 | } 194 | } 195 | } 196 | } 197 | 198 | if (!result.empty()) 199 | { 200 | std::sort(result.begin(), result.end()); 201 | } 202 | } 203 | 204 | static void ProcessMotionVector(VPIImage mvImg, cv::Mat &outputImage) 205 | { 206 | // Lock the input image to access it from CPU 207 | VPIImageData mvData; 208 | CHECK_STATUS(vpiImageLock(mvImg, VPI_LOCK_READ, &mvData)); 209 | 210 | // Create a cv::Mat that points to the input image data 211 | cv::Mat mvImage; 212 | CHECK_STATUS(vpiImageDataExportOpenCVMat(mvData, &mvImage)); 213 | 214 | // Convert S10.5 format to float 215 | cv::Mat flow(mvImage.size(), CV_32FC2); 216 | mvImage.convertTo(flow, CV_32F, 1.0f / (1 << 5)); 217 | 218 | // Image not needed anymore, we can unlock it. 219 | CHECK_STATUS(vpiImageUnlock(mvImg)); 220 | 221 | outputImage = flow; 222 | 223 | } 224 | 225 | static void WriteFlowVectors(const std::string& outputFilePattern, 226 | const int frameIdx, 227 | const cv::Mat& outputImage, 228 | const int mvWidth, 229 | const int mvHeight) 230 | { 231 | std::ostringstream fileName; 232 | fileName << outputFilePattern << "_"; 233 | fileName << std::setw(5) << std::setfill('0') << frameIdx << std::string("_middlebury.flo") ; 234 | 235 | std::ofstream fpOut(fileName.str(), std::ios::out | std::ios::binary); 236 | 237 | fpOut << TAG_STRING; 238 | 239 | fpOut.write((char*)(&mvWidth), sizeof(uint32_t)); 240 | fpOut.write((char*)(&mvHeight), sizeof(uint32_t)); 241 | fpOut.write((char*)outputImage.data, sizeof(float) * mvWidth * mvHeight * 2); 242 | fpOut.close(); 243 | } 244 | 245 | int main(int argc, char *argv[]) 246 | { 247 | // OpenCV image that will be wrapped by a VPIImage. 
248 | // Define it here so that it's destroyed *after* wrapper is destroyed 249 | cv::Mat cvPrevFrame, cvCurFrame; 250 | 251 | // VPI objects that will be used 252 | VPIStream stream = NULL; 253 | VPIImage imgPrevFramePL = NULL; 254 | VPIImage imgPrevFrameTmp = NULL; 255 | VPIImage imgPrevFrameBL = NULL; 256 | VPIImage imgCurFramePL = NULL; 257 | VPIImage imgCurFrameTmp = NULL; 258 | VPIImage imgCurFrameBL = NULL; 259 | VPIImage imgMotionVecBL = NULL; 260 | VPIPayload payload = NULL; 261 | 262 | int retval = 0; 263 | 264 | try 265 | { 266 | if (argc != 4) 267 | { 268 | std::cout< "); 270 | } 271 | 272 | // Parse input parameters 273 | std::string strInputFilesPattern = argv[1]; 274 | std::string strOuputFilesPattern = argv[2]; 275 | std::string strQuality = argv[3]; 276 | 277 | VPIOpticalFlowQuality quality; 278 | if (strQuality == "low") 279 | { 280 | quality = VPI_OPTICAL_FLOW_QUALITY_LOW; 281 | } 282 | else if (strQuality == "medium") 283 | { 284 | quality = VPI_OPTICAL_FLOW_QUALITY_MEDIUM; 285 | } 286 | else if (strQuality == "high") 287 | { 288 | quality = VPI_OPTICAL_FLOW_QUALITY_HIGH; 289 | } 290 | else 291 | { 292 | throw std::runtime_error("Unknown quality provided"); 293 | } 294 | 295 | VPIBackend backend; 296 | backend = VPI_BACKEND_NVENC; 297 | // Load the files list 298 | std::vector inputFilesList; 299 | glob(strInputFilesPattern, inputFilesList); 300 | 301 | // Create the stream where processing will happen. We'll use user-provided backend 302 | // for Optical Flow, and CUDA/VIC for image format conversions. 303 | CHECK_STATUS(vpiStreamCreate(backend | VPI_BACKEND_CUDA | VPI_BACKEND_VIC, &stream)); 304 | 305 | cvPrevFrame = cv::imread(inputFilesList[0]); 306 | 307 | // Create the previous and current frame wrapper using the first frame. This wrapper will 308 | // be set to point to every new frame in the main loop. 309 | CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(cvPrevFrame, 0, &imgPrevFramePL)); 310 | CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(cvPrevFrame, 0, &imgCurFramePL)); 311 | 312 | // Define the image formats we'll use throughout this sample. 313 | VPIImageFormat imgFmt = VPI_IMAGE_FORMAT_NV12_ER; 314 | VPIImageFormat imgFmtBL = VPI_IMAGE_FORMAT_NV12_ER_BL; 315 | 316 | int32_t width = cvPrevFrame.cols; 317 | int32_t height = cvPrevFrame.rows; 318 | 319 | // Create Dense Optical Flow payload to be executed on the given backend 320 | CHECK_STATUS(vpiCreateOpticalFlowDense(backend, width, height, imgFmtBL, quality, &payload)); 321 | 322 | // The Dense Optical Flow on NVENC backend expects input to be in block-linear format. 323 | // Since Convert Image Format algorithm doesn't currently support direct BGR 324 | // pitch-linear (from OpenCV) to NV12 block-linear conversion, it must be done in two 325 | // passes, first from BGR/PL to NV12/PL using CUDA, then from NV12/PL to NV12/BL using VIC. 326 | // The temporary image buffer below will store the intermediate NV12/PL representation. 327 | CHECK_STATUS(vpiImageCreate(width, height, imgFmt, 0, &imgPrevFrameTmp)); 328 | CHECK_STATUS(vpiImageCreate(width, height, imgFmt, 0, &imgCurFrameTmp)); 329 | 330 | // Now create the final block-linear buffer that'll be used as input to the 331 | // algorithm. 
332 | CHECK_STATUS(vpiImageCreate(width, height, imgFmtBL, 0, &imgPrevFrameBL)); 333 | CHECK_STATUS(vpiImageCreate(width, height, imgFmtBL, 0, &imgCurFrameBL)); 334 | 335 | // Motion vector image width and height, align to be multiple of 4 336 | int32_t mvWidth = (width + 3) / 4; 337 | int32_t mvHeight = (height + 3) / 4; 338 | 339 | 340 | // Create the output motion vector buffer 341 | CHECK_STATUS(vpiImageCreate(mvWidth, mvHeight, VPI_IMAGE_FORMAT_2S16_BL, 0, &imgMotionVecBL)); 342 | 343 | // First convert the first frame to NV12_BL. It'll be used as previous frame when the algorithm is called. 344 | CHECK_STATUS(vpiSubmitConvertImageFormat(stream, VPI_BACKEND_CUDA, imgPrevFramePL, imgPrevFrameTmp, nullptr)); 345 | CHECK_STATUS(vpiSubmitConvertImageFormat(stream, VPI_BACKEND_VIC, imgPrevFrameTmp, imgPrevFrameBL, nullptr)); 346 | 347 | // Create a output image which holds the rendered motion vector image. 348 | cv::Mat mvOutputImage; 349 | 350 | // Fetch a new frame until video ends 351 | int idxFrame = 1; 352 | int outIdxFrame = 0; 353 | for(idxFrame = 1; idxFrame < inputFilesList.size(); idxFrame++) 354 | { 355 | printf("Processing frame %d\n", idxFrame); 356 | cvCurFrame = cv::imread(inputFilesList[idxFrame]); 357 | // Wrap frame into a VPIImage, reusing the existing imgCurFramePL. 358 | CHECK_STATUS(vpiImageSetWrappedOpenCVMat(imgCurFramePL, cvCurFrame)); 359 | 360 | // Convert current frame to NV12_BL format 361 | CHECK_STATUS(vpiSubmitConvertImageFormat(stream, VPI_BACKEND_CUDA, imgCurFramePL, imgCurFrameTmp, nullptr)); 362 | CHECK_STATUS(vpiSubmitConvertImageFormat(stream, VPI_BACKEND_VIC, imgCurFrameTmp, imgCurFrameBL, nullptr)); 363 | 364 | CHECK_STATUS( 365 | vpiSubmitOpticalFlowDense(stream, backend, payload, imgPrevFrameBL, imgCurFrameBL, imgMotionVecBL)); 366 | 367 | // Wait for processing to finish. 368 | CHECK_STATUS(vpiStreamSync(stream)); 369 | 370 | // Render the resulting motion vector in the output image 371 | ProcessMotionVector(imgMotionVecBL, mvOutputImage); 372 | 373 | // Save to output files: 374 | WriteFlowVectors(strOuputFilesPattern, outIdxFrame++, mvOutputImage, mvWidth, mvHeight); 375 | 376 | // Swap previous frame and next frame 377 | std::swap(cvPrevFrame, cvCurFrame); 378 | std::swap(imgPrevFramePL, imgCurFramePL); 379 | std::swap(imgPrevFrameBL, imgCurFrameBL); 380 | } 381 | } 382 | catch (std::exception &e) 383 | { 384 | std::cerr << e.what() << std::endl; 385 | retval = 1; 386 | } 387 | 388 | // Destroy all resources used 389 | vpiStreamDestroy(stream); 390 | vpiPayloadDestroy(payload); 391 | 392 | vpiImageDestroy(imgPrevFramePL); 393 | vpiImageDestroy(imgPrevFrameTmp); 394 | vpiImageDestroy(imgPrevFrameBL); 395 | vpiImageDestroy(imgCurFramePL); 396 | vpiImageDestroy(imgCurFrameTmp); 397 | vpiImageDestroy(imgCurFrameBL); 398 | vpiImageDestroy(imgMotionVecBL); 399 | 400 | return retval; 401 | } 402 | 403 | // vim: ts=8:sw=4:sts=4:et:ai 404 | --------------------------------------------------------------------------------
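The VPI sample above stores each frame's motion vectors with `WriteFlowVectors()`: the 4-byte Middlebury tag `PIEH`, the motion-vector width and height as 32-bit integers, then the interleaved (u, v) float32 data at quarter resolution. Below is a minimal reader sketch for those files; the function name `read_flo`, the NumPy dependency, and the example path are illustrative and not part of the repository.

```python
import numpy as np

def read_flo(path):
    """Read one Middlebury .flo file produced by WriteFlowVectors() above."""
    with open(path, "rb") as f:
        tag = f.read(4)
        if tag != b"PIEH":
            raise ValueError("{}: not a Middlebury .flo file".format(path))
        # Width/height of the motion-vector grid ((input_size + 3) / 4 in the sample).
        width, height = np.fromfile(f, dtype=np.int32, count=2)
        data = np.fromfile(f, dtype=np.float32, count=int(width) * int(height) * 2)
    # The CV_32FC2 buffer is row-major with interleaved channels: H x W x (u, v).
    return data.reshape(int(height), int(width), 2)

# Example with a placeholder output pattern "out":
# flow = read_flo("out_00000_middlebury.flo")
```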