├── .gitignore ├── LICENSE ├── code ├── CMakeLists.txt ├── include │ ├── EntroyCalibrator.h │ ├── PluginFactory.h │ ├── TrtNet.h │ ├── UpsampleLayer.h │ ├── Utils.h │ ├── YoloConfigs.h │ └── YoloLayer.h └── src │ ├── EntroyCalibrator.cpp │ ├── TrtNet.cpp │ ├── UpsampleLayer.cpp │ ├── UpsampleLayer.cu │ └── YoloLayer.cu ├── readme.md └── sample ├── CMakeLists.txt ├── argsParser.h ├── configs.h ├── dataReader.cpp ├── dataReader.h ├── eval.cpp ├── eval.h ├── evalClassifNet.cpp ├── runNet.cpp └── runTwoNets.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | ## General 2 | # Visual Studio Code files 3 | .vscode 4 | 5 | # build 6 | build 7 | 8 | # install 9 | install -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 lewes6369 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(trtNet) 3 | 4 | set(CMAKE_BUILD_TYPE Release) 5 | 6 | #include 7 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) 8 | 9 | #src 10 | set(PLUGIN_SOURCES 11 | src/EntroyCalibrator.cpp 12 | src/UpsampleLayer.cpp 13 | src/UpsampleLayer.cu 14 | src/YoloLayer.cu 15 | src/TrtNet.cpp 16 | ) 17 | 18 | # 19 | # CUDA Configuration 20 | # 21 | find_package(CUDA REQUIRED) 22 | 23 | set(CUDA_VERBOSE_BUILD ON) 24 | 25 | # Specify the cuda host compiler to use the same compiler as cmake. 
26 | set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) 27 | 28 | # TensorRT 29 | find_path(TENSORRT_INCLUDE_DIR NvInfer.h 30 | HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR} 31 | PATH_SUFFIXES include) 32 | MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}") 33 | find_library(TENSORRT_LIBRARY_INFER nvinfer 34 | HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} 35 | PATH_SUFFIXES lib lib64 lib/x64) 36 | find_library(TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin 37 | HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} 38 | PATH_SUFFIXES lib lib64 lib/x64) 39 | find_library(TENSORRT_LIBRARY_PARSER nvparsers 40 | HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} 41 | PATH_SUFFIXES lib lib64 lib/x64) 42 | set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN} ${TENSORRT_LIBRARY_PARSER}) 43 | MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}") 44 | find_package_handle_standard_args( 45 | TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR TENSORRT_LIBRARY) 46 | if(NOT TENSORRT_FOUND) 47 | message(ERROR 48 | "Cannot find TensorRT library.") 49 | endif() 50 | 51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") # -std=gnu++11 52 | 53 | list(APPEND CUDA_NVCC_FLAGS "-D_FORCE_INLINES -Xcompiler -fPIC") 54 | CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR}) 55 | CUDA_ADD_LIBRARY(TrtNet STATIC ${PLUGIN_SOURCES}) 56 | 57 | target_include_directories(TrtNet PUBLIC ${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR}) 58 | target_link_libraries(TrtNet ${TENSORRT_LIBRARY}) -------------------------------------------------------------------------------- /code/include/EntroyCalibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef _ENTROY_CALIBRATOR_H 2 | #define _ENTROY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "NvInfer.h" 8 | #include 
"Utils.h" 9 | 10 | namespace nvinfer1 11 | { 12 | 13 | class Int8EntropyCalibrator : public IInt8EntropyCalibrator 14 | { 15 | public: 16 | Int8EntropyCalibrator(int BatchSize,const std::vector>& data,const std::string& CalibDataName = "",bool readCache = true); 17 | 18 | virtual ~Int8EntropyCalibrator(); 19 | 20 | int getBatchSize() const override { return mBatchSize; } 21 | 22 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override; 23 | 24 | const void* readCalibrationCache(size_t& length) override; 25 | 26 | void writeCalibrationCache(const void* cache, size_t length) override; 27 | 28 | private: 29 | std::string mCalibDataName; 30 | std::vector> mDatas; 31 | int mBatchSize; 32 | 33 | int mCurBatchIdx; 34 | float* mCurBatchData{ nullptr }; 35 | 36 | size_t mInputCount; 37 | bool mReadCache; 38 | void* mDeviceInput{ nullptr }; 39 | 40 | std::vector mCalibrationCache; 41 | }; 42 | 43 | } //namespace 44 | 45 | #endif //_ENTROY_CALIBRATOR_H 46 | -------------------------------------------------------------------------------- /code/include/PluginFactory.h: -------------------------------------------------------------------------------- 1 | #ifndef __PLUGIN_FACTORY_H_ 2 | #define __PLUGIN_FACTORY_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include "UpsampleLayer.h" 8 | #include "YoloLayer.h" 9 | #include "NvInferPlugin.h" 10 | #include "NvCaffeParser.h" 11 | 12 | namespace Tn 13 | { 14 | static constexpr float NEG_SLOPE = 0.1; 15 | static constexpr float UPSAMPLE_SCALE = 2.0; 16 | static constexpr int CUDA_THREAD_NUM = 512; 17 | 18 | // Integration for serialization. 
19 | using nvinfer1::plugin::INvPlugin; 20 | using nvinfer1::plugin::createPReLUPlugin; 21 | using nvinfer1::UpsampleLayerPlugin; 22 | using nvinfer1::YoloLayerPlugin; 23 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactoryExt 24 | { 25 | public: 26 | inline bool isLeakyRelu(const char* layerName) 27 | { 28 | return std::regex_match(layerName , std::regex(R"(layer(\d*)-act)")); 29 | } 30 | 31 | inline bool isUpsample(const char* layerName) 32 | { 33 | return std::regex_match(layerName , std::regex(R"(layer(\d*)-upsample)")); 34 | } 35 | 36 | inline bool isYolo(const char* layerName) 37 | { 38 | return strcmp(layerName,"yolo-det") == 0; 39 | } 40 | 41 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override 42 | { 43 | assert(isPlugin(layerName)); 44 | 45 | if(isLeakyRelu(layerName)) 46 | { 47 | assert(nbWeights == 0 && weights == nullptr); 48 | mPluginLeakyRelu.emplace_back(std::unique_ptr(createPReLUPlugin(NEG_SLOPE), nvPluginDeleter)); 49 | return mPluginLeakyRelu.back().get(); 50 | } 51 | else if (isUpsample(layerName)) 52 | { 53 | assert(nbWeights == 0 && weights == nullptr); 54 | mPluginUpsample.emplace_back(std::unique_ptr(new UpsampleLayerPlugin(UPSAMPLE_SCALE,CUDA_THREAD_NUM))); 55 | return mPluginUpsample.back().get(); 56 | } 57 | else if (isYolo(layerName)) 58 | { 59 | assert(nbWeights == 0 && weights == nullptr && mPluginYolo.get() == nullptr); 60 | mPluginYolo.reset(new YoloLayerPlugin(CUDA_THREAD_NUM)); 61 | return mPluginYolo.get(); 62 | } 63 | else 64 | { 65 | assert(0); 66 | return nullptr; 67 | } 68 | } 69 | 70 | nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override 71 | { 72 | assert(isPlugin(layerName)); 73 | 74 | if (isLeakyRelu(layerName)) 75 | { 76 | mPluginLeakyRelu.emplace_back(std::unique_ptr(createPReLUPlugin(serialData, serialLength), nvPluginDeleter)); 77 | return 
mPluginLeakyRelu.back().get(); 78 | } 79 | else if (isUpsample(layerName)) 80 | { 81 | mPluginUpsample.emplace_back(std::unique_ptr(new UpsampleLayerPlugin(serialData, serialLength))); 82 | return mPluginUpsample.back().get(); 83 | } 84 | else if (isYolo(layerName)) 85 | { 86 | assert(mPluginYolo.get() == nullptr); 87 | mPluginYolo.reset(new YoloLayerPlugin(serialData, serialLength)); 88 | return mPluginYolo.get(); 89 | } 90 | else 91 | { 92 | assert(0); 93 | return nullptr; 94 | } 95 | } 96 | 97 | 98 | bool isPlugin(const char* name) override 99 | { 100 | return isPluginExt(name); 101 | } 102 | 103 | bool isPluginExt(const char* name) override 104 | { 105 | //std::cout << "check plugin " << name << isYolo(name)<< std::endl; 106 | return isLeakyRelu(name) || isUpsample(name) || isYolo(name); 107 | } 108 | 109 | // The application has to destroy the plugin when it knows it's safe to do so. 110 | void destroyPlugin() 111 | { 112 | for (auto& item : mPluginLeakyRelu) 113 | item.reset(); 114 | 115 | for (auto& item : mPluginUpsample) 116 | item.reset(); 117 | 118 | mPluginYolo.reset(); 119 | } 120 | 121 | void (*nvPluginDeleter)(INvPlugin*){[](INvPlugin* ptr) { if(ptr) ptr->destroy(); }}; 122 | 123 | std::vector> mPluginLeakyRelu{}; 124 | std::vector> mPluginUpsample{}; 125 | std::unique_ptr mPluginYolo {nullptr}; 126 | }; 127 | } 128 | 129 | #endif -------------------------------------------------------------------------------- /code/include/TrtNet.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_NET_H_ 2 | #define __TRT_NET_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "NvInferPlugin.h" 10 | #include "NvCaffeParser.h" 11 | #include "PluginFactory.h" 12 | #include "Utils.h" 13 | 14 | namespace Tn 15 | { 16 | enum class RUN_MODE 17 | { 18 | FLOAT32 = 0, 19 | FLOAT16 = 1, 20 | INT8 = 2 21 | }; 22 | 23 | class trtNet 24 | { 25 | public: 26 | //Load from caffe model 27 | 
trtNet(const std::string& prototxt,const std::string& caffeModel,const std::vector& outputNodesName, 28 | const std::vector>& calibratorData, RUN_MODE mode = RUN_MODE::FLOAT32 , int maxBatchSize = 1); 29 | 30 | //Load from engine file 31 | explicit trtNet(const std::string& engineFile); 32 | 33 | ~trtNet() 34 | { 35 | // Release the stream and the buffers 36 | cudaStreamSynchronize(mTrtCudaStream); 37 | cudaStreamDestroy(mTrtCudaStream); 38 | for(auto& item : mTrtCudaBuffer) 39 | cudaFree(item); 40 | 41 | mTrtPluginFactory.destroyPlugin(); 42 | 43 | if(!mTrtRunTime) 44 | mTrtRunTime->destroy(); 45 | if(!mTrtContext) 46 | mTrtContext->destroy(); 47 | if(!mTrtEngine) 48 | mTrtEngine->destroy(); 49 | }; 50 | 51 | void saveEngine(std::string fileName) 52 | { 53 | if(mTrtEngine) 54 | { 55 | nvinfer1::IHostMemory* data = mTrtEngine->serialize(); 56 | std::ofstream file; 57 | file.open(fileName,std::ios::binary | std::ios::out); 58 | if(!file.is_open()) 59 | { 60 | std::cout << "read create engine file" << fileName <<" failed" << std::endl; 61 | return; 62 | } 63 | 64 | file.write((const char*)data->data(), data->size()); 65 | file.close(); 66 | } 67 | }; 68 | 69 | void doInference(const void* inputData, void* outputData,int batchSize = 1); 70 | 71 | inline size_t getInputSize() { 72 | return std::accumulate(mTrtBindBufferSize.begin(), mTrtBindBufferSize.begin() + mTrtInputCount,0); 73 | }; 74 | 75 | inline size_t getOutputSize() { 76 | return std::accumulate(mTrtBindBufferSize.begin() + mTrtInputCount, mTrtBindBufferSize.end(),0); 77 | }; 78 | 79 | void printTime() 80 | { 81 | mTrtProfiler.printLayerTimes(mTrtIterationTime); 82 | } 83 | 84 | inline int getBatchSize() {return mTrtBatchSize;}; 85 | 86 | private: 87 | nvinfer1::ICudaEngine* loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize, 88 | nvcaffeparser1::ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory, 89 | nvinfer1::IInt8Calibrator* calibrator, 
nvinfer1::IHostMemory*& trtModelStream,const std::vector& outputNodesName); 90 | 91 | void InitEngine(); 92 | 93 | nvinfer1::IExecutionContext* mTrtContext; 94 | nvinfer1::ICudaEngine* mTrtEngine; 95 | nvinfer1::IRuntime* mTrtRunTime; 96 | PluginFactory mTrtPluginFactory; 97 | cudaStream_t mTrtCudaStream; 98 | Profiler mTrtProfiler; 99 | RUN_MODE mTrtRunMode; 100 | 101 | std::vector mTrtCudaBuffer; 102 | std::vector mTrtBindBufferSize; 103 | int mTrtInputCount; 104 | int mTrtIterationTime; 105 | int mTrtBatchSize; 106 | }; 107 | } 108 | 109 | #endif //__TRT_NET_H_ 110 | -------------------------------------------------------------------------------- /code/include/UpsampleLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _UPSAMPLE_LAYER_H 2 | #define _UPSAMPLE_LAYER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "NvInfer.h" 10 | #include "Utils.h" 11 | #include 12 | 13 | namespace nvinfer1 14 | { 15 | class UpsampleLayerPlugin: public IPluginExt 16 | { 17 | public: 18 | explicit UpsampleLayerPlugin(const float scale, const int cudaThread = 512); 19 | // create the plugin at runtime from a byte stream 20 | UpsampleLayerPlugin(const void* data, size_t length); 21 | 22 | ~UpsampleLayerPlugin(); 23 | 24 | int getNbOutputs() const override 25 | { 26 | return 1; 27 | } 28 | 29 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 30 | 31 | bool supportsFormat(DataType type, PluginFormat format) const override { 32 | //std::cout << "supportsFormat=== type:" << int(type) << "format" << int(format) << std::endl; 33 | return (type == DataType::kFLOAT || type == DataType::kHALF || type == DataType::kINT8 ) 34 | && format == PluginFormat::kNCHW; 35 | } 36 | 37 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override; 38 | 39 | int initialize() override; 
40 | 41 | virtual void terminate() override { 42 | }; 43 | 44 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 45 | 46 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 47 | 48 | virtual size_t getSerializationSize() override 49 | { 50 | return sizeof(nvinfer1::Dims) + sizeof(mDataType) + sizeof(mScale) 51 | + sizeof(mOutputWidth) + sizeof(mOutputHeight) + sizeof(mThreadCount); 52 | } 53 | 54 | virtual void serialize(void* buffer) override; 55 | 56 | template 57 | void forwardGpu(const Dtype* input,Dtype * outputint ,int N,int C,int H ,int W); 58 | 59 | private: 60 | nvinfer1::Dims mCHW; 61 | DataType mDataType{DataType::kFLOAT}; 62 | float mScale; 63 | int mOutputWidth; 64 | int mOutputHeight; 65 | int mThreadCount; 66 | 67 | void* mInputBuffer {nullptr}; 68 | void* mOutputBuffer {nullptr}; 69 | }; 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /code/include/Utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 
| } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /code/include/YoloConfigs.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_CONFIGS_H_ 2 | #define _YOLO_CONFIGS_H_ 3 | 4 | 5 | namespace Yolo 6 | { 7 | static constexpr int CHECK_COUNT = 3; 8 | static constexpr float IGNORE_THRESH = 0.5f; 9 | static 
constexpr int CLASS_NUM = 80; 10 | 11 | struct YoloKernel 12 | { 13 | int width; 14 | int height; 15 | float anchors[CHECK_COUNT*2]; 16 | }; 17 | 18 | //YOLO 608 19 | YoloKernel yolo1 = { 20 | 19, 21 | 19, 22 | {116,90, 156,198, 373,326} 23 | }; 24 | YoloKernel yolo2 = { 25 | 38, 26 | 38, 27 | {30,61, 62,45, 59,119} 28 | }; 29 | YoloKernel yolo3 = { 30 | 76, 31 | 76, 32 | {10,13, 16,30, 33,23} 33 | }; 34 | 35 | //YOLO 416 36 | // YoloKernel yolo1 = { 37 | // 13, 38 | // 13, 39 | // {116,90, 156,198, 373,326} 40 | // }; 41 | // YoloKernel yolo2 = { 42 | // 26, 43 | // 26, 44 | // {30,61, 62,45, 59,119} 45 | // }; 46 | // YoloKernel yolo3 = { 47 | // 52, 48 | // 52, 49 | // {10,13, 16,30, 33,23} 50 | // }; 51 | } 52 | 53 | #endif -------------------------------------------------------------------------------- /code/include/YoloLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "NvInfer.h" 10 | #include "Utils.h" 11 | #include 12 | 13 | namespace Yolo 14 | { 15 | struct YoloKernel; 16 | 17 | static constexpr int LOCATIONS = 4; 18 | struct alignas(float) Detection{ 19 | //x y w h 20 | float bbox[LOCATIONS]; 21 | //float objectness; 22 | int classId; 23 | float prob; 24 | }; 25 | } 26 | 27 | 28 | namespace nvinfer1 29 | { 30 | class YoloLayerPlugin: public IPluginExt 31 | { 32 | public: 33 | explicit YoloLayerPlugin(const int cudaThread = 512); 34 | YoloLayerPlugin(const void* data, size_t length); 35 | 36 | ~YoloLayerPlugin(); 37 | 38 | int getNbOutputs() const override 39 | { 40 | return 1; 41 | } 42 | 43 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 44 | 45 | bool supportsFormat(DataType type, PluginFormat format) const override { 46 | return type == DataType::kFLOAT && format == PluginFormat::kNCHW; 47 | } 48 | 49 | void configureWithFormat(const Dims* 
inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override {}; 50 | 51 | int initialize() override; 52 | 53 | virtual void terminate() override {}; 54 | 55 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 56 | 57 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 58 | 59 | virtual size_t getSerializationSize() override; 60 | 61 | virtual void serialize(void* buffer) override; 62 | 63 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 64 | 65 | void forwardCpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 66 | 67 | private: 68 | int mClassCount; 69 | int mKernelCount; 70 | std::vector mYoloKernel; 71 | int mThreadCount; 72 | 73 | //cpu 74 | void* mInputBuffer {nullptr}; 75 | void* mOutputBuffer {nullptr}; 76 | }; 77 | }; 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /code/src/EntroyCalibrator.cpp: -------------------------------------------------------------------------------- 1 | #include "EntroyCalibrator.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace nvinfer1 9 | { 10 | Int8EntropyCalibrator::Int8EntropyCalibrator(int BatchSize,const std::vector>& data, 11 | const std::string& CalibDataName /*= ""*/,bool readCache /*= true*/) 12 | : mCalibDataName(CalibDataName),mBatchSize(BatchSize),mReadCache(readCache) 13 | { 14 | mDatas.reserve(data.size()); 15 | mDatas = data; 16 | 17 | mInputCount = BatchSize * data[0].size(); 18 | mCurBatchData = new float[mInputCount]; 19 | mCurBatchIdx = 0; 20 | CUDA_CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); 21 | } 22 | 23 | 24 | Int8EntropyCalibrator::~Int8EntropyCalibrator() 25 | { 26 | CUDA_CHECK(cudaFree(mDeviceInput)); 27 | if(mCurBatchData) 28 | delete[] 
mCurBatchData; 29 | } 30 | 31 | 32 | bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings) 33 | { 34 | if (mCurBatchIdx + mBatchSize > int(mDatas.size())) 35 | return false; 36 | 37 | float* ptr = mCurBatchData; 38 | size_t imgSize = mInputCount / mBatchSize; 39 | auto iter = mDatas.begin() + mCurBatchIdx; 40 | 41 | std::for_each(iter, iter + mBatchSize, [=,&ptr](std::vector& val){ 42 | assert(imgSize == val.size()); 43 | memcpy(ptr,val.data(),imgSize*sizeof(float)); 44 | 45 | ptr += imgSize; 46 | }); 47 | 48 | CUDA_CHECK(cudaMemcpy(mDeviceInput, mCurBatchData, mInputCount * sizeof(float), cudaMemcpyHostToDevice)); 49 | //std::cout << "input name " << names[0] << std::endl; 50 | bindings[0] = mDeviceInput; 51 | 52 | std::cout << "load batch " << mCurBatchIdx << " to " << mCurBatchIdx + mBatchSize - 1 << std::endl; 53 | mCurBatchIdx += mBatchSize; 54 | return true; 55 | } 56 | 57 | const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) 58 | { 59 | mCalibrationCache.clear(); 60 | std::ifstream input(mCalibDataName+".calib", std::ios::binary); 61 | input >> std::noskipws; 62 | if (mReadCache && input.good()) 63 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); 64 | 65 | length = mCalibrationCache.size(); 66 | return length ? 
&mCalibrationCache[0] : nullptr; 67 | } 68 | 69 | void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) 70 | { 71 | std::ofstream output(mCalibDataName+".calib", std::ios::binary); 72 | output.write(reinterpret_cast(cache), length); 73 | } 74 | 75 | } -------------------------------------------------------------------------------- /code/src/TrtNet.cpp: -------------------------------------------------------------------------------- 1 | #include "TrtNet.h" 2 | #include "EntroyCalibrator.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace nvinfer1; 14 | using namespace nvcaffeparser1; 15 | using namespace plugin; 16 | 17 | static Tn::Logger gLogger; 18 | 19 | #define RETURN_AND_LOG(ret, severity, message) \ 20 | do \ 21 | { \ 22 | std::string error_message = "ssd_error_log: " + std::string(message); \ 23 | gLogger.log(ILogger::Severity::k##severity, error_message.c_str()); \ 24 | return (ret); \ 25 | } while (0) 26 | 27 | inline void* safeCudaMalloc(size_t memSize) 28 | { 29 | void* deviceMem; 30 | CUDA_CHECK(cudaMalloc(&deviceMem, memSize)); 31 | if (deviceMem == nullptr) 32 | { 33 | std::cerr << "Out of memory" << std::endl; 34 | exit(1); 35 | } 36 | return deviceMem; 37 | } 38 | 39 | inline int64_t volume(const nvinfer1::Dims& d) 40 | { 41 | return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); 42 | } 43 | 44 | inline unsigned int getElementSize(nvinfer1::DataType t) 45 | { 46 | switch (t) 47 | { 48 | case nvinfer1::DataType::kINT32: return 4; 49 | case nvinfer1::DataType::kFLOAT: return 4; 50 | case nvinfer1::DataType::kHALF: return 2; 51 | case nvinfer1::DataType::kINT8: return 1; 52 | } 53 | throw std::runtime_error("Invalid DataType."); 54 | return 0; 55 | } 56 | 57 | namespace Tn 58 | { 59 | trtNet::trtNet(const std::string& prototxt,const std::string& caffemodel,const std::vector& outputNodesName, 60 | const std::vector>& 
calibratorData,RUN_MODE mode /*= RUN_MODE::FLOAT32*/,int maxBatchSize /*= 1*/) 61 | :mTrtContext(nullptr),mTrtEngine(nullptr),mTrtRunTime(nullptr),mTrtRunMode(mode),mTrtInputCount(0),mTrtIterationTime(0),mTrtBatchSize(maxBatchSize) 62 | { 63 | std::cout << "init plugin proto: " << prototxt << " caffemodel: " << caffemodel << std::endl; 64 | auto parser = createCaffeParser(); 65 | 66 | IHostMemory* trtModelStream{nullptr}; 67 | 68 | Int8EntropyCalibrator * calibrator = nullptr; 69 | if (calibratorData.size() > 0 ){ 70 | auto endPos= prototxt.find_last_of("."); 71 | auto beginPos= prototxt.find_last_of('/') + 1; 72 | std::string calibratorName = prototxt.substr(beginPos,endPos - beginPos); 73 | std::cout << "create calibrator,Named:" << calibratorName << std::endl; 74 | calibrator = new Int8EntropyCalibrator(maxBatchSize,calibratorData,calibratorName); 75 | } 76 | 77 | PluginFactory pluginFactorySerialize; 78 | ICudaEngine* tmpEngine = loadModelAndCreateEngine(prototxt.c_str(),caffemodel.c_str(), maxBatchSize, parser, &pluginFactorySerialize, calibrator, trtModelStream,outputNodesName); 79 | assert(tmpEngine != nullptr); 80 | assert(trtModelStream != nullptr); 81 | if(calibrator){ 82 | delete calibrator; 83 | calibrator = nullptr; 84 | } 85 | tmpEngine->destroy(); 86 | pluginFactorySerialize.destroyPlugin(); 87 | 88 | mTrtRunTime = createInferRuntime(gLogger); 89 | assert(mTrtRunTime != nullptr); 90 | mTrtEngine= mTrtRunTime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), &mTrtPluginFactory); 91 | assert(mTrtEngine != nullptr); 92 | // Deserialize the engine. 
93 | trtModelStream->destroy(); 94 | 95 | InitEngine(); 96 | } 97 | 98 | trtNet::trtNet(const std::string& engineFile) 99 | :mTrtContext(nullptr),mTrtEngine(nullptr),mTrtRunTime(nullptr),mTrtRunMode(RUN_MODE::FLOAT32),mTrtInputCount(0),mTrtIterationTime(0) 100 | { 101 | using namespace std; 102 | fstream file; 103 | 104 | file.open(engineFile,ios::binary | ios::in); 105 | if(!file.is_open()) 106 | { 107 | cout << "read engine file" << engineFile <<" failed" << endl; 108 | return; 109 | } 110 | file.seekg(0, ios::end); 111 | int length = file.tellg(); 112 | file.seekg(0, ios::beg); 113 | std::unique_ptr data(new char[length]); 114 | file.read(data.get(), length); 115 | 116 | file.close(); 117 | 118 | std::cout << "deserializing" << std::endl; 119 | mTrtRunTime = createInferRuntime(gLogger); 120 | assert(mTrtRunTime != nullptr); 121 | mTrtEngine= mTrtRunTime->deserializeCudaEngine(data.get(), length, &mTrtPluginFactory); 122 | assert(mTrtEngine != nullptr); 123 | 124 | InitEngine(); 125 | } 126 | 127 | void trtNet::InitEngine() 128 | { 129 | mTrtBatchSize = mTrtEngine->getMaxBatchSize(); 130 | mTrtContext = mTrtEngine->createExecutionContext(); 131 | assert(mTrtContext != nullptr); 132 | mTrtContext->setProfiler(&mTrtProfiler); 133 | 134 | // Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings() 135 | int nbBindings = mTrtEngine->getNbBindings(); 136 | 137 | mTrtCudaBuffer.resize(nbBindings); 138 | mTrtBindBufferSize.resize(nbBindings); 139 | for (int i = 0; i < nbBindings; ++i) 140 | { 141 | Dims dims = mTrtEngine->getBindingDimensions(i); 142 | DataType dtype = mTrtEngine->getBindingDataType(i); 143 | int64_t totalSize = volume(dims) * mTrtBatchSize * getElementSize(dtype); 144 | mTrtBindBufferSize[i] = totalSize; 145 | mTrtCudaBuffer[i] = safeCudaMalloc(totalSize); 146 | if(mTrtEngine->bindingIsInput(i)) 147 | mTrtInputCount++; 148 | } 149 | 150 | CUDA_CHECK(cudaStreamCreate(&mTrtCudaStream)); 151 | } 
152 | 153 | 154 | nvinfer1::ICudaEngine* trtNet::loadModelAndCreateEngine(const char* deployFile, const char* modelFile,int maxBatchSize, 155 | ICaffeParser* parser, nvcaffeparser1::IPluginFactory* pluginFactory, 156 | IInt8Calibrator* calibrator, IHostMemory*& trtModelStream,const std::vector& outputNodesName) 157 | { 158 | // Create the builder 159 | IBuilder* builder = createInferBuilder(gLogger); 160 | 161 | // Parse the model to populate the network, then set the outputs. 162 | INetworkDefinition* network = builder->createNetwork(); 163 | parser->setPluginFactory(pluginFactory); 164 | 165 | std::cout << "Begin parsing model..." << std::endl; 166 | const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,modelFile, *network, nvinfer1::DataType::kFLOAT); 167 | if (!blobNameToTensor) 168 | RETURN_AND_LOG(nullptr, ERROR, "Fail to parse"); 169 | std::cout << "End parsing model..." << std::endl; 170 | 171 | // specify which tensors are outputs 172 | for (auto& name : outputNodesName) 173 | { 174 | auto output = blobNameToTensor->find(name.c_str()); 175 | assert(output!=nullptr); 176 | if (output == nullptr) 177 | std::cout << "can not find output named " << name << std::endl; 178 | 179 | network->markOutput(*output); 180 | } 181 | 182 | // Build the engine. 183 | builder->setMaxBatchSize(maxBatchSize); 184 | builder->setMaxWorkspaceSize(1 << 30);// 1G 185 | if (mTrtRunMode == RUN_MODE::INT8) 186 | { 187 | std::cout <<"setInt8Mode"<platformHasFastInt8()) 189 | std::cout << "Notice: the platform do not has fast for int8" << std::endl; 190 | builder->setInt8Mode(true); 191 | builder->setInt8Calibrator(calibrator); 192 | } 193 | else if (mTrtRunMode == RUN_MODE::FLOAT16) 194 | { 195 | std::cout <<"setFp16Mode"<platformHasFastFp16()) 197 | std::cout << "Notice: the platform do not has fast for fp16" << std::endl; 198 | builder->setFp16Mode(true); 199 | } 200 | 201 | std::cout << "Begin building engine..." 
<< std::endl; 202 | ICudaEngine* engine = builder->buildCudaEngine(*network); 203 | if (!engine) 204 | RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine"); 205 | std::cout << "End building engine..." << std::endl; 206 | 207 | // We don't need the network any more, and we can destroy the parser. 208 | network->destroy(); 209 | parser->destroy(); 210 | 211 | // Serialize the engine, then close everything down. 212 | trtModelStream = engine->serialize(); 213 | 214 | builder->destroy(); 215 | shutdownProtobufLibrary(); 216 | return engine; 217 | } 218 | 219 | void trtNet::doInference(const void* inputData, void* outputData ,int batchSize /*= 1*/) 220 | { 221 | //static const int batchSize = 1; 222 | assert(mTrtInputCount == 1); 223 | assert(batchSize <= mTrtBatchSize); 224 | 225 | // DMA the input to the GPU, execute the batch asynchronously, and DMA it back: 226 | int inputIndex = 0; 227 | CUDA_CHECK(cudaMemcpyAsync(mTrtCudaBuffer[inputIndex], inputData, mTrtBindBufferSize[inputIndex], cudaMemcpyHostToDevice, mTrtCudaStream)); 228 | auto t_start = std::chrono::high_resolution_clock::now(); 229 | mTrtContext->execute(batchSize, &mTrtCudaBuffer[inputIndex]); 230 | auto t_end = std::chrono::high_resolution_clock::now(); 231 | float total = std::chrono::duration(t_end - t_start).count(); 232 | std::cout << "Time taken for inference is " << total << " ms." 
<< std::endl; 233 | 234 | for (size_t bindingIdx = mTrtInputCount; bindingIdx < mTrtBindBufferSize.size(); ++bindingIdx) 235 | { 236 | auto size = mTrtBindBufferSize[bindingIdx]; 237 | CUDA_CHECK(cudaMemcpyAsync(outputData, mTrtCudaBuffer[bindingIdx], size, cudaMemcpyDeviceToHost, mTrtCudaStream)); 238 | outputData = (char *)outputData + size; 239 | } 240 | 241 | //cudaStreamSynchronize(mTrtCudaStream); 242 | 243 | mTrtIterationTime ++ ; 244 | } 245 | } -------------------------------------------------------------------------------- /code/src/UpsampleLayer.cpp: -------------------------------------------------------------------------------- 1 | #include "UpsampleLayer.h" 2 | 3 | namespace nvinfer1 4 | { 5 | UpsampleLayerPlugin::UpsampleLayerPlugin(const float scale, const int cudaThread /*= 512*/) 6 | : mScale(scale),mThreadCount(cudaThread) 7 | { 8 | } 9 | 10 | UpsampleLayerPlugin::~UpsampleLayerPlugin() 11 | { 12 | 13 | } 14 | 15 | // create the plugin at runtime from a byte stream 16 | UpsampleLayerPlugin::UpsampleLayerPlugin(const void* data, size_t length) 17 | { 18 | using namespace Tn; 19 | const char *d = reinterpret_cast(data), *a = d; 20 | read(d, mCHW); 21 | read(d, mDataType); 22 | read(d, mScale); 23 | read(d, mOutputWidth); 24 | read(d, mOutputHeight); 25 | read(d, mThreadCount); 26 | 27 | //std::cout << "read:" << a << " " << mOutputWidth<< " " <(buffer), *a = d; 35 | write(d, mCHW); 36 | write(d, mDataType); 37 | write(d, mScale); 38 | write(d, mOutputWidth); 39 | write(d, mOutputHeight); 40 | write(d, mThreadCount); 41 | 42 | //std::cout << "write:" << a << " " << mOutputHeight<< " " < 22 | __global__ void upscale(const Dtype *input, Dtype *output, 23 | int no_elements, int scale_factor, int d1, int d2, int d3) { 24 | int ii = threadIdx.x + blockDim.x * blockIdx.x; 25 | if (ii >= no_elements) return; 26 | int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); 27 | output[ii]=input[ipidx]; 28 | } 29 | 30 | template 31 | void 
UpsampleLayerPlugin::forwardGpu(const Dtype* input,Dtype * output, 32 | int N,int C,int H ,int W) { 33 | 34 | int numElem = N*C*H*W; 35 | upscale<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>>(input,output, numElem, mScale, C, H, W); 36 | } 37 | 38 | size_t type2size(DataType dataType) { 39 | size_t _size = 0; 40 | switch (dataType) 41 | { 42 | case DataType::kFLOAT: _size = sizeof(float);break; 43 | case DataType::kHALF: _size = sizeof(__half);break; 44 | case DataType::kINT8: _size = sizeof(u_int8_t);break; 45 | default:std::cerr << "error data type" << std::endl; 46 | } 47 | return _size; 48 | } 49 | 50 | int UpsampleLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 51 | { 52 | const int channels = mCHW.d[0]; 53 | const int64_t in_height = mCHW.d[1]; 54 | const int64_t in_width = mCHW.d[2]; 55 | const int64_t out_height = mOutputHeight; 56 | const int64_t out_width = mOutputWidth; 57 | int totalElems = batchSize * in_height * in_width * channels; 58 | 59 | // Handle no-op resizes efficiently. 
60 | if (out_height == in_height && out_width == in_width) { 61 | CUDA_CHECK(cudaMemcpyAsync(outputs[0], inputs[0], totalElems * type2size(mDataType), cudaMemcpyDeviceToDevice, stream)); 62 | CUDA_CHECK(cudaStreamSynchronize(stream)); 63 | return 0; 64 | } 65 | //CUDA_CHECK(cudaStreamSynchronize(stream)); 66 | 67 | switch (mDataType) 68 | { 69 | case DataType::kFLOAT : 70 | forwardGpu((const float *)inputs[0],(float *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); 71 | break; 72 | case DataType::kHALF: 73 | forwardGpu<__half>((const __half *)inputs[0],(__half *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); 74 | break; 75 | case DataType::kINT8: 76 | forwardGpu((const u_int8_t *)inputs[0],(u_int8_t *)outputs[0],batchSize,mCHW.d[0],mOutputHeight,mOutputWidth); 77 | break; 78 | default: 79 | std::cerr << "error data type" << std::endl; 80 | } 81 | return 0; 82 | }; 83 | } -------------------------------------------------------------------------------- /code/src/YoloLayer.cu: -------------------------------------------------------------------------------- 1 | #include "YoloConfigs.h" 2 | #include "YoloLayer.h" 3 | 4 | using namespace Yolo; 5 | 6 | namespace nvinfer1 7 | { 8 | YoloLayerPlugin::YoloLayerPlugin(const int cudaThread /*= 512*/):mThreadCount(cudaThread) 9 | { 10 | mClassCount = CLASS_NUM; 11 | mYoloKernel.clear(); 12 | mYoloKernel.push_back(yolo1); 13 | mYoloKernel.push_back(yolo2); 14 | mYoloKernel.push_back(yolo3); 15 | 16 | mKernelCount = mYoloKernel.size(); 17 | } 18 | 19 | YoloLayerPlugin::~YoloLayerPlugin() 20 | { 21 | if(mInputBuffer) 22 | CUDA_CHECK(cudaFreeHost(mInputBuffer)); 23 | 24 | if(mOutputBuffer) 25 | CUDA_CHECK(cudaFreeHost(mOutputBuffer)); 26 | } 27 | 28 | // create the plugin at runtime from a byte stream 29 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 30 | { 31 | using namespace Tn; 32 | const char *d = reinterpret_cast(data), *a = d; 33 | read(d, mClassCount); 34 | read(d, mThreadCount); 
35 | read(d, mKernelCount); 36 | mYoloKernel.resize(mKernelCount); 37 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 38 | memcpy(mYoloKernel.data(),d,kernelSize); 39 | d += kernelSize; 40 | 41 | assert(d == a + length); 42 | } 43 | 44 | void YoloLayerPlugin::serialize(void* buffer) 45 | { 46 | using namespace Tn; 47 | char* d = static_cast(buffer), *a = d; 48 | write(d, mClassCount); 49 | write(d, mThreadCount); 50 | write(d, mKernelCount); 51 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 52 | memcpy(d,mYoloKernel.data(),kernelSize); 53 | d += kernelSize; 54 | 55 | assert(d == a + getSerializationSize()); 56 | } 57 | 58 | size_t YoloLayerPlugin::getSerializationSize() 59 | { 60 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 61 | } 62 | 63 | int YoloLayerPlugin::initialize() 64 | { 65 | int totalCount = 0; 66 | for(const auto& yolo : mYoloKernel) 67 | totalCount += (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT; 68 | CUDA_CHECK(cudaHostAlloc(&mInputBuffer, totalCount * sizeof(float), cudaHostAllocDefault)); 69 | 70 | totalCount = 0;//detection count 71 | for(const auto& yolo : mYoloKernel) 72 | totalCount += yolo.width*yolo.height * CHECK_COUNT; 73 | CUDA_CHECK(cudaHostAlloc(&mOutputBuffer, sizeof(float) + totalCount * sizeof(Detection), cudaHostAllocDefault)); 74 | return 0; 75 | } 76 | 77 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 78 | { 79 | //output the result to channel 80 | int totalCount = 0; 81 | for(const auto& yolo : mYoloKernel) 82 | totalCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float); 83 | 84 | return Dims3(totalCount + 1, 1, 1); 85 | } 86 | 87 | void YoloLayerPlugin::forwardCpu(const float*const * inputs, float* outputs, cudaStream_t stream,int batchSize) 88 | { 89 | auto Logist = [=](float data){ 90 | return 1./(1. 
+ exp(-data)); 91 | }; 92 | 93 | int totalOutputCount = 0; 94 | int i = 0; 95 | int totalCount = 0; 96 | for(const auto& yolo : mYoloKernel) 97 | { 98 | totalOutputCount += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float); 99 | totalCount += (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT; 100 | ++ i; 101 | } 102 | 103 | for (int idx = 0; idx < batchSize;idx++) 104 | { 105 | i = 0; 106 | float* inputData = (float *)mInputBuffer;// + idx *totalCount; //if create more batch size 107 | for(const auto& yolo : mYoloKernel) 108 | { 109 | int size = (LOCATIONS + 1 + mClassCount) * yolo.width*yolo.height * CHECK_COUNT; 110 | CUDA_CHECK(cudaMemcpyAsync(inputData, (float *)inputs[i] + idx * size, size * sizeof(float), cudaMemcpyDeviceToHost, stream)); 111 | inputData += size; 112 | ++ i; 113 | } 114 | 115 | CUDA_CHECK(cudaStreamSynchronize(stream)); 116 | 117 | inputData = (float *)mInputBuffer ;//+ idx *totalCount; //if create more batch size 118 | std::vector result; 119 | for (const auto& yolo : mYoloKernel) 120 | { 121 | int stride = yolo.width*yolo.height; 122 | for (int j = 0;j < stride ;++j) 123 | { 124 | for (int k = 0;k < CHECK_COUNT; ++k ) 125 | { 126 | int beginIdx = (LOCATIONS + 1 + mClassCount)* stride *k + j; 127 | int objIndex = beginIdx + LOCATIONS*stride; 128 | 129 | //check obj 130 | float objProb = Logist(inputData[objIndex]); 131 | if(objProb <= IGNORE_THRESH) 132 | continue; 133 | 134 | //classes 135 | int classId = -1; 136 | float maxProb = IGNORE_THRESH; 137 | for (int c = 0;c< mClassCount;++c){ 138 | float cProb = Logist(inputData[beginIdx + (5 + c) * stride]) * objProb; 139 | if(cProb > maxProb){ 140 | maxProb = cProb; 141 | classId = c; 142 | } 143 | } 144 | 145 | if(classId >= 0) { 146 | Detection det; 147 | int row = j / yolo.width; 148 | int cols = j % yolo.width; 149 | 150 | //Location 151 | det.bbox[0] = (cols + Logist(inputData[beginIdx]))/ yolo.width; 152 | det.bbox[1] = (row + 
Logist(inputData[beginIdx+stride]))/ yolo.height; 153 | det.bbox[2] = exp(inputData[beginIdx+2*stride]) * yolo.anchors[2*k]; 154 | det.bbox[3] = exp(inputData[beginIdx+3*stride]) * yolo.anchors[2*k + 1]; 155 | det.classId = classId; 156 | det.prob = maxProb; 157 | 158 | result.emplace_back(det); 159 | } 160 | } 161 | } 162 | 163 | inputData += (LOCATIONS + 1 + mClassCount) * stride * CHECK_COUNT; 164 | } 165 | 166 | 167 | int detCount =result.size(); 168 | auto data = (float *)mOutputBuffer;// + idx*(totalOutputCount + 1); //if create more batch size 169 | float * begin = data; 170 | //copy count; 171 | data[0] = (float)detCount; 172 | data++; 173 | //copy result 174 | memcpy(data,result.data(),result.size()*sizeof(Detection)); 175 | 176 | //(count + det result) 177 | CUDA_CHECK(cudaMemcpyAsync(outputs, begin,sizeof(float) + result.size()*sizeof(Detection), cudaMemcpyHostToDevice, stream)); 178 | 179 | outputs += totalOutputCount + 1; 180 | } 181 | }; 182 | 183 | __device__ float Logist(float data){ return 1./(1. 
+ exp(-data)); }; 184 | 185 | __global__ void CalDetection(const float *input, float *output,int noElements, 186 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 187 | 188 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 189 | if (idx >= noElements) return; 190 | 191 | int stride = yoloWidth*yoloHeight; 192 | int bnIdx = idx / stride; 193 | 194 | int curIdx = idx - stride*bnIdx; 195 | 196 | const float* curInput = input + bnIdx* ((LOCATIONS + 1 + classes) * stride * CHECK_COUNT); 197 | 198 | for (int k = 0;k < CHECK_COUNT; ++k ) 199 | { 200 | int beginIdx = (LOCATIONS + 1 + classes)* stride *k + curIdx; 201 | int objIndex = beginIdx + LOCATIONS*stride; 202 | 203 | //check objectness 204 | float objProb = Logist(curInput[objIndex]); 205 | if(objProb <= IGNORE_THRESH) 206 | continue; 207 | 208 | int row = curIdx / yoloWidth; 209 | int cols = curIdx % yoloWidth; 210 | 211 | //classes 212 | int classId = -1; 213 | float maxProb = IGNORE_THRESH; 214 | for (int c = 0;c maxProb){ 217 | maxProb = cProb; 218 | classId = c; 219 | } 220 | } 221 | 222 | if(classId >= 0) { 223 | float *curOutput = output + bnIdx*outputElem; 224 | int resCount = (int)atomicAdd(curOutput,1); 225 | char* data = (char * )curOutput + sizeof(float) + resCount*sizeof(Detection); 226 | Detection* det = (Detection*)(data); 227 | 228 | //Location 229 | det->bbox[0] = (cols + Logist(curInput[beginIdx]))/ yoloWidth; 230 | det->bbox[1] = (row + Logist(curInput[beginIdx+stride]))/ yoloHeight; 231 | det->bbox[2] = exp(curInput[beginIdx+2*stride]) * anchors[2*k]; 232 | det->bbox[3] = exp(curInput[beginIdx+3*stride]) * anchors[2*k + 1]; 233 | det->classId = classId; 234 | det->prob = maxProb; 235 | } 236 | } 237 | } 238 | 239 | void YoloLayerPlugin::forwardGpu(const float *const * inputs,float * output,cudaStream_t stream,int batchSize) { 240 | void* devAnchor; 241 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 242 | 
CUDA_CHECK(cudaMalloc(&devAnchor,AnchorLen));
243 |     // Output layout per batch image: [detectionCount(float), Detection0, Detection1, ...] flattened to floats.
244 |     int outputElem = 1;
245 |     for (unsigned int i = 0;i< mYoloKernel.size();++i)
246 |     {
247 |         const auto& yolo = mYoloKernel[i];
248 |         outputElem += yolo.width*yolo.height * CHECK_COUNT * sizeof(Detection) / sizeof(float);
249 |     }
250 | 
251 |     for(int idx = 0 ;idx < batchSize;++idx)
252 |         CUDA_CHECK(cudaMemsetAsync(output + idx*outputElem, 0, sizeof(float), stream)); // zero the per-image detection counter on the caller's stream
253 | 
254 |     int numElem = 0;
255 |     for (unsigned int i = 0;i< mYoloKernel.size();++i)
256 |     {
257 |         const auto& yolo = mYoloKernel[i];
258 |         numElem = yolo.width*yolo.height*batchSize;
259 |         CUDA_CHECK(cudaMemcpyAsync(devAnchor,yolo.anchors,AnchorLen,cudaMemcpyHostToDevice,stream)); // keep copies ordered with the kernels below
260 |         CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
261 |             (inputs[i],output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount ,outputElem);
262 |     }
263 |     CUDA_CHECK(cudaStreamSynchronize(stream)); // kernels must finish reading devAnchor before it is freed
264 |     CUDA_CHECK(cudaFree(devAnchor));
265 | }
266 | 
267 | 
268 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
269 | {
270 |     //assert(batchSize == 1);
271 |     //GPU
272 |     //CUDA_CHECK(cudaStreamSynchronize(stream));
273 |     forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
274 | 
275 |     //CPU
276 |     //forwardCpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);
277 |     return 0;
278 | };
279 | 
280 | }
281 | 
-------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # TRTWrapper 2 | 3 | ### Desc 4 | a wrapper for tensorRT net (parser caffe) 5 | ### Test Environments 6 | Ubuntu 16.04 7 | TensorRT 5.0.2.6/4.0.1.6 8 | CUDA 9.2 9 | ### About Wrapper 10 | you can use the wrapper like this: 11 | ```cpp 12 | //normal 13 | std::vector> calibratorData; 14 | trtNet net("vgg16.prototxt","vgg16.caffemodel",{"prob"},calibratorData); 15 | //fp16 16 | trtNet
net_fp16("vgg16.prototxt","vgg16.caffemodel",{"prob"},calibratorData,RUN_MODE::FLOAT16);
17 | //int8
18 | trtNet net_int8("vgg16.prototxt","vgg16.caffemodel",{"prob"},calibratorData,RUN_MODE::INT8);
19 | 
20 | //run inference:
21 | net.doInference(input_data.get(), outputData.get());
22 | 
23 | //can print time cost
24 | net.printTime();
25 | 
26 | //can write to engine and load From engine
27 | net.saveEngine("save_1.engine");
28 | trtNet net2("save_1.engine");
29 | ```
30 | When you need to add a new plugin, just add the plugin code to PluginFactory.
31 | ### Run Sample
32 | ```bash
33 | #for classification
34 | cd sample
35 | mkdir build
36 | cd build && cmake .. && make && make install
37 | cd ..
38 | ./install/runNet --caffemodel=${CAFFE_MODEL_NAME} --prototxt=${CAFFE_PROTOTXT} --input=./test.jpg
39 | ```
40 | -------------------------------------------------------------------------------- /sample/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8)
2 | project(demo)
3 | 
4 | set(CMAKE_BUILD_TYPE Release)
5 | 
6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors")
7 | 
8 | #add lib
9 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../code sample.out)
10 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../code)
11 | 
12 | #add opencv
13 | find_package(OpenCV REQUIRED)
14 | include_directories(${OpenCV_INCLUDE_DIRS})
15 | link_directories(${OpenCV_LIBRARIES_DIRS}) # NOTE(review): OpenCV_LIBRARIES_DIRS is not set by OpenCVConfig; harmless because ${OpenCV_LIBS} resolves the libraries — confirm and consider removing
16 | 
17 | #build classifer
18 | add_executable(classifer evalClassifNet.cpp eval.cpp dataReader.cpp)
19 | target_link_libraries(classifer TrtNet ${OpenCV_LIBS})
20 | 
21 | #build runNet
22 | add_executable(runNet runNet.cpp)
23 | target_link_libraries(runNet TrtNet ${OpenCV_LIBS})
24 | 
25 | #build runTwoNets
26 | add_executable(runTwoNets runTwoNets.cpp)
27 | target_link_libraries(runTwoNets TrtNet ${OpenCV_LIBS})
28 | 
29 | install(TARGETS classifer DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/install/)
30 | 
install(TARGETS runNet DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/install/) 31 | install(TARGETS runTwoNets DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/install/) -------------------------------------------------------------------------------- /sample/argsParser.h: -------------------------------------------------------------------------------- 1 | #ifndef __ARGS_PARSER_H_ 2 | #define __ARGS_PARSER_H_ 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace argsParser 12 | { 13 | using std::string; 14 | 15 | enum class P_DATA_TYPE 16 | { 17 | PARSER_BOOL, 18 | PARSER_INT, 19 | PARSER_FLOAT, 20 | PARSER_DOUBLE, 21 | PARSER_STRING 22 | }; 23 | 24 | struct parserInfo 25 | { 26 | string desc; 27 | string defaultValue; 28 | string valueDesc; 29 | 30 | P_DATA_TYPE dataType; 31 | string value; 32 | }; 33 | 34 | typedef string Desc; 35 | typedef string ValueDesc; 36 | typedef string DefaultValue; 37 | 38 | class parser 39 | { 40 | 41 | #define ADD_ARG_FUNCS(DATA_TYPE) \ 42 | static void ADD_ARG_##DATA_TYPE(string name,Desc desc,DefaultValue defaultValue,ValueDesc valueDesc =""){ \ 43 | InnerInitArgs(name,desc,defaultValue,valueDesc,P_DATA_TYPE::PARSER_##DATA_TYPE); \ 44 | } 45 | 46 | public: 47 | static void InnerInitArgs(string name,Desc desc,DefaultValue defaultValue,ValueDesc valueDesc,P_DATA_TYPE dataType) 48 | { 49 | mArgs.emplace(std::make_pair(name, parserInfo{desc,defaultValue,valueDesc, dataType ,defaultValue})); 50 | } 51 | 52 | ADD_ARG_FUNCS(INT); 53 | ADD_ARG_FUNCS(FLOAT); 54 | ADD_ARG_FUNCS(DOUBLE); 55 | ADD_ARG_FUNCS(STRING); 56 | ADD_ARG_FUNCS(BOOL); 57 | 58 | static void printDesc() 59 | { 60 | for (const auto& data :mArgs ) 61 | { 62 | string name = data.first; 63 | auto& info = data.second; 64 | 65 | if(info.valueDesc.length() > 0) 66 | name += "=<" + info.valueDesc + ">"; 67 | 68 | std::cout << std::left << std::setw(20) << name; 69 | std::cout << std::setw(2) << "=" << std::setw(2); 70 | std::cout << std::left << 
std::setw(80) << info.desc + "(default:" + info.defaultValue + ")"; 71 | std::cout << std::endl; 72 | } 73 | } 74 | 75 | static void parseArgs(int argc,char** argv) 76 | { 77 | string* str_argvs = new string[argc]; 78 | for(int i = 0;i mArgs; 137 | }; 138 | 139 | std::map parser::mArgs ; 140 | }; 141 | 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /sample/configs.h: -------------------------------------------------------------------------------- 1 | #ifndef _CONFIGS_H_ 2 | #define _CONFIGS_H_ 3 | 4 | #include 5 | namespace Tn 6 | { 7 | //src input image size 8 | static const int INPUT_CHANNEL = 3; 9 | static const int INPUT_WIDTH = 608; 10 | static const int INPUT_HEIGHT = 608; 11 | static const int RESIZE_H = 256; 12 | static const int RESIZE_W = 256; 13 | static const float SCALE = 0.017f; 14 | static const char* MEAN_VALUE = "103.94,116.78,123.68"; 15 | 16 | //input data 17 | static const char* INPUT_PROTOTXT = "alexnet.prototxt"; 18 | static const char* INPUT_CAFFEMODEL = "alexnet.caffemodel"; 19 | static const std::string INPUT_IMAGE = "test.jpg"; 20 | static const char* EVAL_LIST = ""; 21 | static const char* CALIBRATION_LIST = ""; 22 | static const char* MODE = "fp32"; 23 | static const char* OUTPUTS = "prob"; 24 | 25 | static const int ITER_TIMES = 1000; 26 | } 27 | #endif -------------------------------------------------------------------------------- /sample/dataReader.cpp: -------------------------------------------------------------------------------- 1 | #include "dataReader.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | namespace Tn 10 | { 11 | list readFileList(const string& fileName) 12 | { 13 | ifstream file(fileName); 14 | if(!file.is_open()) 15 | { 16 | cout << "read file list error,please check file :" << fileName << endl; 17 | exit(-1); 18 | } 19 | 20 | string strLine; 21 | list files; 22 | while( getline(file,strLine) ) 23 | 
files.push_back(strLine); 24 | 25 | file.close(); 26 | 27 | return files; 28 | } 29 | 30 | list readLabelFileList(const string& fileName) 31 | { 32 | ifstream file(fileName); 33 | if(!file.is_open()) 34 | { 35 | cout << "read file list error,please check file :" << fileName << endl; 36 | exit(-1); 37 | } 38 | 39 | string strLine; 40 | list result; 41 | while(!file.eof()) 42 | { 43 | Source data; 44 | file >> data.fileName >> data.label; 45 | result.emplace_back(data); 46 | } 47 | 48 | return result; 49 | } 50 | 51 | vector split(const string& str, char delim) 52 | { 53 | stringstream ss(str); 54 | string token; 55 | vector container; 56 | while (getline(ss, token, delim)) { 57 | container.push_back(token); 58 | } 59 | 60 | return container; 61 | } 62 | 63 | // vector split(string str, string pat) 64 | // { 65 | // vector bufStr; 66 | // while (true) 67 | // { 68 | // int index = str.find(pat); 69 | // string subStr = str.substr(0, index); 70 | // if (!subStr.empty()) 71 | // bufStr.push_back(subStr); 72 | // str.erase(0, index + pat.size()); 73 | // if (index == -1) 74 | // break; 75 | // } 76 | // return bufStr; 77 | // } 78 | 79 | std::tuple, std::list>> readObjectLabelFileList(const string& fileName) 80 | { 81 | list fileList; 82 | list> bBoxes; 83 | 84 | ifstream file(fileName); 85 | if(!file.is_open()) 86 | { 87 | cout << "read file list error,please check file :" << fileName << endl; 88 | exit(-1); 89 | } 90 | 91 | string strLine; 92 | while( getline(file,strLine) ) 93 | { 94 | vector line=split(strLine, '\n'); 95 | if(line.size() < 1) 96 | continue; 97 | vector strs=split(line[0], ' '); 98 | 99 | int idx = 0; 100 | string dataName=strs[idx++]; 101 | 102 | int trueBoxCount = (strs.size() - 1)/2; 103 | vector truthboxes; 104 | truthboxes.reserve(trueBoxCount); 105 | for (int i = 0 ;i < trueBoxCount ;++i) 106 | { 107 | //class 108 | string classId = strs[idx++]; 109 | 110 | //bbox Length 111 | int length = strs[idx].length(); 112 | //remove bracket [ ] 113 | 
string bbox = strs[idx++].substr(1,length-2); 114 | 115 | vector strs_txt = split(bbox, ','); 116 | Bbox truthbox; 117 | truthbox.classId = stoi(classId); 118 | truthbox.left = stof(strs_txt[0]); 119 | truthbox.top = stof(strs_txt[1]); 120 | truthbox.right = truthbox.left + stof(strs_txt[2]); 121 | truthbox.bot = truthbox.top + stof(strs_txt[3]); 122 | 123 | truthboxes.push_back(truthbox); 124 | } 125 | 126 | fileList.emplace_back(dataName); 127 | bBoxes.emplace_back(truthboxes); 128 | } 129 | 130 | file.close(); 131 | 132 | return make_tuple(move(fileList),move(bBoxes)); 133 | } 134 | } -------------------------------------------------------------------------------- /sample/dataReader.h: -------------------------------------------------------------------------------- 1 | #ifndef _DATA_READER_H_ 2 | #define _DATA_READER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace Tn 10 | { 11 | std::list readFileList(const std::string& fileName); 12 | 13 | struct Source 14 | { 15 | std::string fileName; 16 | int label; 17 | }; 18 | std::list readLabelFileList(const std::string& fileName); 19 | 20 | struct Bbox 21 | { 22 | int classId; 23 | int left; 24 | int right; 25 | int top; 26 | int bot; 27 | float score; 28 | }; 29 | //[lst,lst] 30 | std::tuple, std::list>> readObjectLabelFileList(const std::string& fileName); 31 | } 32 | 33 | #endif -------------------------------------------------------------------------------- /sample/eval.cpp: -------------------------------------------------------------------------------- 1 | #include "eval.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | namespace Tn 11 | { 12 | float evalTopResult(list>& result,list& groundTruth,int* TP /*= nullptr*/,int* FP /*= nullptr*/,int topK /*= 1*/) 13 | { 14 | int _TP = TP ? *TP: 0; 15 | int _FP = FP ? 
*FP: 0; 16 | 17 | assert(result.size() == groundTruth.size()); 18 | 19 | auto pRe = result.begin(); 20 | auto pGT = groundTruth.begin(); 21 | for (; pRe != result.end() && pGT != groundTruth.end(); 22 | ++pRe, ++pGT) 23 | { 24 | auto& labels = *pRe; 25 | int truthClass = *pGT; 26 | float gtProb = labels[truthClass]; 27 | 28 | int biggerCount = 0; 29 | for (auto& prob : labels) 30 | { 31 | if (prob >= gtProb) 32 | ++biggerCount; 33 | } 34 | 35 | biggerCount > topK ? ++_FP : ++_TP; 36 | } 37 | 38 | float accuracy=float(_TP)/(_TP+_FP); 39 | if(TP) *TP =_TP; 40 | if(FP) *FP =_FP; 41 | 42 | cout<<"top " << topK <<" accuracy :"<< setprecision(4) << accuracy << endl; 43 | 44 | return accuracy; 45 | } 46 | 47 | float iou_compute(const Bbox& a,const Bbox& b) 48 | { 49 | int and_right=min(a.right,b.right); 50 | int and_left =max(a.left,b.left); 51 | int and_top =max(a.top,b.top); 52 | int and_bot =min(a.bot,b.bot); 53 | 54 | if ((and_top>and_bot) || (and_left>and_right)) 55 | { 56 | return 0.0f; 57 | } 58 | float sand=(and_right-and_left)*(and_bot-and_top)*1.0f; 59 | float sa=(a.right-a.left)*(a.bot-a.top)*1.0f; 60 | float sb=(b.right-b.left)*(b.bot-b.top)*1.0f; 61 | 62 | float iou=sand/(sa+sb-sand); 63 | return iou; 64 | } 65 | 66 | float evalMAPResult(const list>& bboxesList,const list>& truthboxesList,int classNum,float iouThresh) 67 | { 68 | assert(bboxesList.size() == truthboxesList.size()); 69 | cout << "evalMAPResult:" << endl; 70 | 71 | float* precision = new float[classNum]; 72 | float* recall = new float[classNum]; 73 | float* AP = new float[classNum]; 74 | 75 | vector **detBox = nullptr; 76 | vector **truthBox = nullptr; 77 | 78 | int sampleCount = bboxesList.size(); 79 | detBox = new vector* [sampleCount]; 80 | truthBox = new vector* [sampleCount]; 81 | for (int i = 0 ;i < sampleCount ; ++ i) 82 | { 83 | detBox[i] = new vector[classNum]{}; 84 | truthBox[i] = new vector[classNum]{}; 85 | } 86 | 87 | auto pBoxIter = bboxesList.begin(); 88 | auto pTrueIter = 
truthboxesList.begin(); 89 | for (int i = 0;i< sampleCount;++i , ++pBoxIter , ++pTrueIter) 90 | { 91 | for (const auto& item : *pBoxIter) 92 | detBox[i][item.classId].push_back(item); 93 | 94 | for (const auto& item : *pTrueIter) 95 | truthBox[i][item.classId].push_back(item); 96 | } 97 | 98 | for (int i = 0;i < classNum; ++ i) 99 | { 100 | using CheckPair = pair; 101 | vector< CheckPair > checkPRBoxs; 102 | int FN = 0; 103 | for (int j = 0;j< sampleCount;++j) 104 | { 105 | auto& dboxes = detBox[j][i]; 106 | auto& tboxes = truthBox[j][i]; 107 | 108 | auto checkTBoxes = tboxes; 109 | for (const auto& item: dboxes) 110 | { 111 | int maxIdx = -1; 112 | float maxIou = 0; 113 | 114 | for (const auto& tItem: checkTBoxes) 115 | { 116 | float iou=iou_compute(item,tItem); 117 | //std::cout << "iou" << iou << std::endl; 118 | if(iou > maxIou) 119 | { 120 | maxIdx = &tItem - &checkTBoxes[0]; 121 | maxIou = iou; 122 | } 123 | } 124 | 125 | if(maxIou > iouThresh) 126 | { 127 | checkPRBoxs.push_back({item,true}); 128 | checkTBoxes.erase(checkTBoxes.begin() + maxIdx); 129 | } 130 | else 131 | { 132 | //FP 133 | checkPRBoxs.push_back({item,false}); 134 | } 135 | } 136 | //FN 137 | FN += checkTBoxes.size(); 138 | } 139 | 140 | float TP = count_if(checkPRBoxs.begin(), checkPRBoxs.end(), [](CheckPair& item){return item.second == true;} ); 141 | 142 | int total = checkPRBoxs.size(); 143 | if(total == 0) 144 | { 145 | AP[i] = 1; 146 | continue; 147 | } 148 | 149 | //recall: 150 | recall[i] = (std::fabs(TP + FN) < 1e-5) ? 1 : TP / (TP + FN); 151 | //precision 152 | precision[i] = TP / total;//total is TP+FP 153 | 154 | //compute AP: 155 | sort(checkPRBoxs.begin(),checkPRBoxs.end(),[](const CheckPair& left,const CheckPair& right){ 156 | return left.first.score > right.first.score; 157 | } 158 | ); 159 | 160 | int PR_TP = 0; 161 | int PR_FP = 0; 162 | vector< pair > PRValues; // 163 | for (const auto& item : checkPRBoxs) 164 | { 165 | item.second ? 
++PR_TP : ++PR_FP; 166 | PRValues.emplace_back( make_pair(PR_TP/ float(PR_TP+PR_FP) , PR_TP / float(TP + FN)) ); 167 | } 168 | 169 | float sum = PRValues[0].first * PRValues[0].second; 170 | 171 | for (unsigned int m = 0; m < PRValues.size()-1;++m) 172 | { 173 | float w = PRValues[m + 1].second - PRValues[m].second ; 174 | float h = PRValues[m + 1].first; 175 | sum += w*h; 176 | } 177 | 178 | AP[i] = sum; 179 | 180 | cout<< setprecision(4) << "class:" << std::setw(3) << i 181 | << " iou thresh-" << iouThresh 182 | << " AP:" << std::setw(7) << AP[i] 183 | << " recall:" << std::setw(7) << recall[i] 184 | << " precision:" << std::setw(7) << precision[i] << endl; 185 | } 186 | 187 | float sumAp = 0; 188 | for (int i = 0;i < classNum;++i) 189 | sumAp += AP[i]; 190 | 191 | float MAP = sumAp / classNum; 192 | cout<< "MAP:" << MAP << endl; 193 | 194 | if (precision) 195 | delete[] precision; 196 | if (recall) 197 | delete[] recall; 198 | if (AP) 199 | delete[] AP; 200 | 201 | for (int i = 0;i < sampleCount; ++i) 202 | { 203 | delete[] detBox[i]; 204 | delete[] truthBox[i]; 205 | } 206 | 207 | delete[] detBox; 208 | delete[] truthBox; 209 | 210 | return MAP; 211 | } 212 | } -------------------------------------------------------------------------------- /sample/eval.h: -------------------------------------------------------------------------------- 1 | #ifndef _EVAL_H_ 2 | #define _EVAL_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include "dataReader.h" 8 | 9 | namespace Tn 10 | { 11 | float evalTopResult(std::list>& result,std::list& groundTruth,int* Tp = nullptr,int* FP = nullptr,int topK = 1); 12 | float evalMAPResult(const std::list>& bboxesList,const std::list> & truthboxesList,int classNum,float iouThresh); 13 | } 14 | 15 | #endif -------------------------------------------------------------------------------- /sample/evalClassifNet.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 
#include 5 | #include "../code/include/TrtNet.h" 6 | #include "argsParser.h" 7 | #include "dataReader.h" 8 | #include "eval.h" 9 | #include "configs.h" 10 | 11 | using namespace std; 12 | using namespace argsParser; 13 | using namespace Tn; 14 | 15 | vector split(const string& str, char delim) 16 | { 17 | stringstream ss(str); 18 | string token; 19 | vector container; 20 | while (getline(ss, token, delim)) { 21 | container.push_back(token); 22 | } 23 | 24 | return container; 25 | } 26 | 27 | cv::Mat GetMeanMat() 28 | { 29 | using namespace cv; 30 | static std::unique_ptr MeanMat = nullptr; 31 | if (MeanMat.get() != nullptr) 32 | return *MeanMat; 33 | 34 | int h = parser::getIntValue("H"); 35 | int w = parser::getIntValue("W"); 36 | int c = parser::getIntValue("C"); 37 | 38 | string meanStr = parser::getStringValue("meanValue"); 39 | auto meansValues = split(meanStr,','); 40 | float scale = parser::getFloatValue("scale"); 41 | 42 | assert(meansValues.size() == c); 43 | vector means(c); 44 | for (int i = 0 ;i preprocess(const string& fileName) 55 | { 56 | using namespace cv; 57 | 58 | Mat img = imread(fileName); 59 | 60 | if(img.data== nullptr) 61 | { 62 | std::cout << "can not open image :" << fileName << std::endl; 63 | return {}; 64 | } 65 | 66 | int channel = parser::getIntValue("C"); 67 | 68 | //channel 69 | Mat sample; 70 | if (img.channels() == 3 && channel == 1) 71 | cvtColor(img, sample, cv::COLOR_BGR2GRAY); 72 | else if (img.channels() == 4 && channel == 1) 73 | cvtColor(img, sample, cv::COLOR_BGRA2GRAY); 74 | else if (img.channels() == 4 && channel == 3) 75 | cvtColor(img, sample, cv::COLOR_BGRA2BGR); 76 | else if (img.channels() == 1 && channel == 3) 77 | cvtColor(img, sample, cv::COLOR_GRAY2BGR); 78 | else 79 | sample = img; 80 | 81 | //resize 82 | int r_h = parser::getIntValue("RH"); 83 | int r_w = parser::getIntValue("RW"); 84 | cv::Mat resized; 85 | cv::resize(sample, resized, cv::Size(r_h,r_w)); 86 | 87 | //crop 88 | int h = parser::getIntValue("H"); 
89 | int w = parser::getIntValue("W"); 90 | int h_off = 0; 91 | int w_off = 0; 92 | h_off = (r_h - h) / 2; 93 | w_off = (r_w - w) / 2; 94 | Rect roi(w_off, h_off, w, h); 95 | Mat croppedImg = resized(roi); 96 | 97 | //to float and scale 98 | cv::Mat img_float; 99 | float scale = parser::getFloatValue("scale"); 100 | if (channel == 3) 101 | croppedImg.convertTo(img_float, CV_32FC3, scale); 102 | else 103 | croppedImg.convertTo(img_float, CV_32FC1, scale); 104 | 105 | 106 | // //mean mat 107 | auto meanFile = GetMeanMat(); 108 | Mat subMeanImg; 109 | cv::subtract(img_float, meanFile, subMeanImg); 110 | 111 | //HWC TO CHW 112 | vector input_channels(channel); 113 | cv::split(subMeanImg, input_channels.data()); 114 | 115 | vector result(h*w*channel); 116 | auto data = result.data(); 117 | int channelLength = h * w; 118 | for (int i = 0; i < channel; ++i) { 119 | memcpy(data,input_channels[i].data,channelLength*sizeof(float)); 120 | data += channelLength; 121 | } 122 | 123 | return result; 124 | } 125 | 126 | int main( int argc, char* argv[] ) 127 | { 128 | parser::ADD_ARG_INT("C",Desc("channel"),DefaultValue(to_string(INPUT_CHANNEL))); 129 | parser::ADD_ARG_INT("H",Desc("height"),DefaultValue(to_string(INPUT_HEIGHT))); 130 | parser::ADD_ARG_INT("W",Desc("width"),DefaultValue(to_string(INPUT_WIDTH))); 131 | parser::ADD_ARG_INT("RH",Desc("image process resized Height"),DefaultValue(to_string(RESIZE_H))); 132 | parser::ADD_ARG_INT("RW",Desc("image process resized Width"),DefaultValue(to_string(RESIZE_W))); 133 | parser::ADD_ARG_FLOAT("scale",Desc("image process scale"),DefaultValue(to_string(SCALE))); 134 | parser::ADD_ARG_STRING("meanValue",Desc("image mean value before scale"),DefaultValue(MEAN_VALUE)); 135 | 136 | parser::ADD_ARG_STRING("caffemodel",Desc("input caffemodel"),DefaultValue(INPUT_CAFFEMODEL),ValueDesc("file")); 137 | parser::ADD_ARG_STRING("prototxt",Desc("input deploy"),DefaultValue(INPUT_PROTOTXT),ValueDesc("file")); 138 | 
parser::ADD_ARG_STRING("evallist",Desc("load test files from list"),DefaultValue(EVAL_LIST),ValueDesc("file")); 139 | parser::ADD_ARG_STRING("calib",Desc("load calibration files from list"),DefaultValue(CALIBRATION_LIST),ValueDesc("file")); 140 | parser::ADD_ARG_STRING("outputs",Desc("output nodes name"),DefaultValue(OUTPUTS)); 141 | parser::ADD_ARG_STRING("mode",Desc("runtime mode"),DefaultValue(MODE), ValueDesc("fp32/fp16/int8")); 142 | 143 | if(argc < 2){ 144 | parser::printDesc(); 145 | exit(-1); 146 | } 147 | 148 | parser::parseArgs(argc,argv); 149 | 150 | vector> calibData; 151 | string calibFileList = parser::getStringValue("calib"); 152 | string mode = parser::getStringValue("mode"); 153 | if(calibFileList.length() > 0 && mode == "int8") 154 | { 155 | cout << "find calibration file,loading ..." << endl; 156 | 157 | ifstream file(calibFileList); 158 | if(!file.is_open()) 159 | { 160 | cout << "read file list error,please check file :" << calibFileList << endl; 161 | exit(-1); 162 | } 163 | 164 | string strLine; 165 | while( getline(file,strLine) ) 166 | { 167 | //std::cout << strLine << std::endl; 168 | auto data = preprocess(strLine); 169 | calibData.emplace_back(data); 170 | } 171 | file.close(); 172 | } 173 | 174 | string deployFile = parser::getStringValue("prototxt"); 175 | string caffemodelFile = parser::getStringValue("caffemodel"); 176 | string outputNodes = parser::getStringValue("outputs"); 177 | auto outputNames = split(outputNodes,','); 178 | 179 | trtNet net(deployFile,caffemodelFile,outputNames,calibData); 180 | 181 | int outputCount = net.getOutputSize()/sizeof(float); 182 | std::unique_ptr outputData(new float[outputCount]); 183 | 184 | list> outputs; 185 | list groundTruth; 186 | string listFile = parser::getStringValue("evallist"); 187 | 188 | cout << "loading process list from " << listFile << endl; 189 | list inputs = readLabelFileList(listFile); 190 | 191 | int tp1 = 0,fp1 =0; 192 | int tp5 = 0,fp5 =0; 193 | const int printInterval = 
500; 194 | int i = 0; 195 | 196 | for (const auto& source :inputs) 197 | { 198 | 199 | std::cout << "process: " << source.fileName << std::endl; 200 | vector inputData = preprocess(source.fileName); 201 | if (!inputData.data()) 202 | continue; 203 | 204 | net.doInference(inputData.data(), outputData.get()); 205 | 206 | //Get Output 207 | auto output = outputData.get(); 208 | 209 | vector res(output,&output[outputCount]); 210 | outputs.emplace_back(res); 211 | groundTruth.push_back(source.label); 212 | 213 | if(++i % printInterval == 0) 214 | { 215 | evalTopResult(outputs,groundTruth,&tp1,&fp1,1); 216 | evalTopResult(outputs,groundTruth,&tp5,&fp5,5); 217 | 218 | outputs.clear(); 219 | groundTruth.clear(); 220 | } 221 | } 222 | 223 | evalTopResult(outputs,groundTruth,&tp1,&fp1,1); 224 | evalTopResult(outputs,groundTruth,&tp5,&fp5,5); 225 | 226 | net.printTime(); 227 | 228 | return 0; 229 | } 230 | 231 | -------------------------------------------------------------------------------- /sample/runNet.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../code/include/TrtNet.h" 6 | #include "argsParser.h" 7 | #include "configs.h" 8 | 9 | using namespace std; 10 | using namespace argsParser; 11 | using namespace Tn; 12 | 13 | unique_ptr prepareImage(const string& fileName) 14 | { 15 | using namespace cv; 16 | 17 | Mat img = imread(fileName); 18 | if(img.data== nullptr) 19 | { 20 | std::cout << "can not open image :" << fileName << std::endl; 21 | return std::unique_ptr(nullptr); 22 | } 23 | 24 | int c = parser::getIntValue("C"); 25 | int h = parser::getIntValue("H"); 26 | int w = parser::getIntValue("W"); 27 | 28 | cv::Mat resized; 29 | cv::resize(img, resized, cv::Size(h,w)); 30 | 31 | cv::Mat img_float; 32 | if (c == 3) 33 | resized.convertTo(img_float, CV_32FC3); 34 | else 35 | resized.convertTo(img_float, CV_32FC1); 36 | 37 | //HWC TO CHW 38 | cv::Mat input_channels[c]; 
39 | cv::split(img_float, input_channels); 40 | 41 | float * data = new float[h*w*c]; 42 | auto result = data; 43 | int channelLength = h * w; 44 | for (int i = 0; i < c; ++i) { 45 | memcpy(data,input_channels[i].data,channelLength*sizeof(float)); 46 | data += channelLength; 47 | } 48 | 49 | return std::unique_ptr(result); 50 | } 51 | 52 | int main( int argc, char* argv[] ) 53 | { 54 | parser::ADD_ARG_FLOAT("prototxt",Desc("input deploy"),DefaultValue(INPUT_PROTOTXT),ValueDesc("file")); 55 | parser::ADD_ARG_FLOAT("caffemodel",Desc("input caffemodel"),DefaultValue(INPUT_CAFFEMODEL),ValueDesc("file")); 56 | parser::ADD_ARG_STRING("input",Desc("input image file"),DefaultValue(INPUT_IMAGE),ValueDesc("file")); 57 | parser::ADD_ARG_INT("C",Desc("channel"),DefaultValue(to_string(INPUT_CHANNEL))); 58 | parser::ADD_ARG_INT("H",Desc("height"),DefaultValue(to_string(INPUT_HEIGHT))); 59 | parser::ADD_ARG_INT("W",Desc("width"),DefaultValue(to_string(INPUT_WIDTH))); 60 | parser::ADD_ARG_INT("iterTimes",Desc("iterations"),DefaultValue(to_string(ITER_TIMES))); 61 | 62 | if(argc < 2){ 63 | parser::printDesc(); 64 | exit(-1); 65 | } 66 | 67 | parser::parseArgs(argc,argv); 68 | 69 | string deployFile = parser::getStringValue("prototxt"); 70 | string caffemodelFile = parser::getStringValue("caffemodel"); 71 | std::vector> calibratorData; 72 | trtNet net(deployFile,caffemodelFile,{"prob"},calibratorData); 73 | 74 | string inputImage = parser::getStringValue("input"); 75 | auto inputData = prepareImage(inputImage); 76 | int outputCount = net.getOutputSize()/sizeof(float); 77 | std::unique_ptr outputData(new float[outputCount]); 78 | 79 | for (int i = 0 ;i 2 | #include 3 | #include 4 | #include 5 | #include "../code/include/TrtNet.h" 6 | #include "argsParser.h" 7 | #include "configs.h" 8 | 9 | using namespace std; 10 | using namespace argsParser; 11 | using namespace Tn; 12 | 13 | unique_ptr prepareImage(const string& fileName) 14 | { 15 | using namespace cv; 16 | 17 | Mat img = 
imread(fileName); 18 | if(img.data== nullptr) 19 | { 20 | std::cout << "can not open image :" << fileName << std::endl; 21 | return std::unique_ptr(nullptr); 22 | } 23 | 24 | int c = parser::getIntValue("C"); 25 | int h = parser::getIntValue("H"); 26 | int w = parser::getIntValue("W"); 27 | 28 | cv::Mat resized; 29 | cv::resize(img, resized, cv::Size(h,w)); 30 | 31 | cv::Mat img_float; 32 | if (c == 3) 33 | resized.convertTo(img_float, CV_32FC3); 34 | else 35 | resized.convertTo(img_float, CV_32FC1); 36 | 37 | //HWC TO CHW 38 | cv::Mat input_channels[c]; 39 | cv::split(img_float, input_channels); 40 | 41 | float * data = new float[h*w*c]; 42 | auto result = data; 43 | int channelLength = h * w; 44 | for (int i = 0; i < c; ++i) { 45 | memcpy(data,input_channels[i].data,channelLength*sizeof(float)); 46 | data += channelLength; 47 | } 48 | 49 | return std::unique_ptr(result); 50 | } 51 | 52 | int main( int argc, char* argv[] ) 53 | { 54 | parser::ADD_ARG_FLOAT("prototxt",Desc("input deploy"),DefaultValue(INPUT_PROTOTXT),ValueDesc("file")); 55 | parser::ADD_ARG_FLOAT("caffemodel",Desc("input caffemodel"),DefaultValue(INPUT_CAFFEMODEL),ValueDesc("file")); 56 | parser::ADD_ARG_STRING("input",Desc("input image file"),DefaultValue(INPUT_IMAGE),ValueDesc("file")); 57 | parser::ADD_ARG_INT("C",Desc("channel"),DefaultValue(to_string(INPUT_CHANNEL))); 58 | parser::ADD_ARG_INT("H",Desc("height"),DefaultValue(to_string(INPUT_HEIGHT))); 59 | parser::ADD_ARG_INT("W",Desc("width"),DefaultValue(to_string(INPUT_WIDTH))); 60 | parser::ADD_ARG_INT("iterTimes",Desc("iterations"),DefaultValue(to_string(ITER_TIMES))); 61 | 62 | if(argc < 2){ 63 | parser::printDesc(); 64 | exit(-1); 65 | } 66 | 67 | parser::parseArgs(argc,argv); 68 | 69 | string deployFile = parser::getStringValue("prototxt"); 70 | string caffemodelFile = parser::getStringValue("caffemodel"); 71 | std::vector> calibratorData; 72 | trtNet net(deployFile,caffemodelFile,{"prob"},calibratorData); 73 | 74 | //NOTE: test for two 
nets, but it may crash by some unknown reason. 75 | trtNet net2(deployFile,caffemodelFile,{"prob"},calibratorData); 76 | //Change to another net also will crash 77 | trtNet net3("yolov3.prototxt","yolov3.caffemodel",{"prob"},calibratorData); 78 | 79 | string inputImage = parser::getStringValue("input"); 80 | auto inputData = prepareImage(inputImage); 81 | int outputCount = net.getOutputSize()/sizeof(float); 82 | std::unique_ptr outputData(new float[outputCount]); 83 | 84 | for (int i = 0 ;i