├── .gitignore
├── CMakeLists.txt
├── README.md
├── half.hpp
├── logging.h
├── main.cpp
├── main_trt.cpp
├── mixformer-pyonnx
│   └── mf_tracker_ort.py
├── mixformer-pytrt
│   ├── mf_tracker_trt.py
│   ├── mixformer_construct_trt.py
│   ├── mixformer_nvinfer.py
│   └── onnx2trt.py
├── mixformer_onnx.cpp
├── mixformer_onnx.h
├── mixformer_trt.cpp
├── mixformer_trt.h
├── test
│   ├── test_client.py
│   └── test_server.py
└── utils
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | build
3 | data
4 | model
5 | lib
6 | onnxruntime
7 | # mixformer-pytrt
8 | __pycache__
9 | *.mp4
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.16)
2 | 
3 | 
4 | option(USE_CUDA "enable ONNXRuntime engine with CUDA provider" ON)
5 | 
6 | set(CMAKE_C_COMPILER "gcc")
7 | set(CMAKE_CXX_COMPILER "g++")
8 | set(CMAKE_CXX_FLAGS "-fPIC -std=c++14")
9 | set(CMAKE_C_FLAGS "-fPIC -std=c11 -O3 -mavx512f")
10 | set(CMAKE_BUILD_TYPE "Release")
11 | project(mixformer-onnx)
12 | 
13 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
14 | find_package(CUDA REQUIRED)
15 | 
16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
17 |     message("-- embed_platform on")
18 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
19 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
20 | 
21 |     # tensorrt
22 |     include_directories(/usr/include/aarch64-linux-gnu)
23 |     link_directories(/usr/lib/aarch64-linux-gnu)
24 | else()
25 |     message("-- embed_platform off")
26 |     # cuda
27 |     include_directories(/usr/local/cuda/include)
28 |     link_directories(/usr/local/cuda/lib64)
29 | 
30 |     # tensorrt
31 |     include_directories(/opt/TensorRT/include)
32 |     link_directories(/opt/TensorRT/lib)
33 | 
34 | endif()
35 | 
36 | include_directories(${PROJECT_SOURCE_DIR})
37 | include_directories(/usr/include/opencv4)
38 | include_directories(/home/code/onnxruntime)
39 | 
40 | link_directories(/usr/lib/x86_64-linux-gnu /home/code/onnxruntime/build/Linux/Release)
41 | 
42 | # add_executable(mixformer-onnx main.cpp mixformer_onnx.cpp mixformer_onnx.h)
43 | # target_link_libraries(mixformer-onnx libonnxruntime.so libopencv_core.so libopencv_highgui.so libopencv_imgproc.so libopencv_video.so libopencv_videoio.so libopencv_imgcodecs.so)
44 | 
45 | add_executable(mixformer-trt main_trt.cpp mixformer_trt.cpp mixformer_trt.h logging.h half.hpp)
46 | target_link_libraries(mixformer-trt nvinfer cudart libopencv_core.so libopencv_highgui.so libopencv_imgproc.so libopencv_video.so libopencv_videoio.so libopencv_imgcodecs.so)
47 | 
48 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MixFormerV2 ONNX C++ and TensorRT-Python version
2 | MixFormerV2: Efficient Fully Transformer Tracking.
3 | 
4 | [official pytorch](https://github.com/MCG-NJU/MixFormerV2.git)
5 | 
6 | This repository provides the MixFormerV2 tracking algorithm with ONNX and TensorRT inference, reaching about 500+ FPS on a 3080-laptop GPU.
7 | 
8 | Python TensorRT (pytrt) and ONNX Runtime (pyort) versions are also provided, which reach about 430 FPS on the same GPU.
9 | # 0. Download model
10 | [mixformer_v2.onnx](https://www.123pan.com/s/6iArVv-FYAJ.html)
11 | 
12 | [mixformer_v2_sim.onnx](https://www.123pan.com/s/6iArVv-mcAJ.html)
13 | 
14 | 
15 | # 1. How to build and run it?
16 | Prerequisites: first, download the source code of [onnxruntime v1.10](https://github.com/microsoft/onnxruntime) and compile it (for details, see lite.ai.toolkit). Put the header files into the onnxruntime folder and the compiled .so files into the lib folder; both folders are located under Mixformerv2-onnx. These steps are not required for TensorRT inference, where you only need to configure TensorRT.
17 | ## Modify your own CMakeLists.txt
18 | Change the onnxruntime include and library paths to match your environment.
19 | 
20 | ## Build
21 | ```
22 | $ mkdir build && cd build
23 | $ cmake .. && make -j
24 | ```
25 | 
26 | ## Run
27 | ```
28 | $ cd /home/code/Mixformerv2-onnx
29 | $ ./mixformer-onnx [model_path] [videopath(file or camera)]
30 | ```
31 | 
32 | # 2. MixFormerV2 TensorRT inference with C++ and Python
33 | Assuming you have configured TensorRT, use onnx2trt to convert the ONNX model to an engine on your GPU platform, and then build and run.
34 | 
35 | ## C++ version
36 | Build and run:
37 | ```
38 | $ cd Mixformerv2-onnx
39 | $ mkdir build && cd build
40 | $ cmake .. && make
41 | $ ./mixformer-trt ../model/mixformer_v2_sim.engine ../target.mp4
42 | ```
43 | ## Python version
44 | Modify the video path in Mixformerv2-onnx/mixformer-pytrt/mf_tracker_trt.py, create a model directory, then download the ONNX file and put it into that directory.
45 | ```
46 | $ cd Mixformerv2-onnx
47 | $ python mixformer-pytrt/onnx2trt.py
48 | $ python mixformer-pytrt/mf_tracker_trt.py
49 | ```
50 | Note: besides simplifying the ONNX model before conversion, make sure that the tensors fed to the engine have the expected shape and that their underlying memory is contiguous.
51 | 
52 | # Acknowledgments
53 | 
54 | Thanks to [LightTrack-ncnn](https://github.com/Z-Xiong/LightTrack-ncnn.git) and [lite.ai.toolkit](https://github.com/DefTruth/lite.ai.toolkit), which helped us implement our ideas quickly.
55 | 
--------------------------------------------------------------------------------
/logging.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | using Severity = nvinfer1::ILogger::Severity; 30 | 31 | class LogStreamConsumerBuffer : public std::stringbuf 32 | { 33 | public: 34 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 35 | : mOutput(stream) 36 | , mPrefix(prefix) 37 | , mShouldLog(shouldLog) 38 | { 39 | } 40 | 41 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 42 | : mOutput(other.mOutput) 43 | { 44 | } 45 | 46 | ~LogStreamConsumerBuffer() 47 | { 48 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 49 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 50 | // if the pointer to the beginning is not equal to the pointer to the current position, 51 | // call putOutput() to log the output to the stream 52 | if (pbase() != pptr()) 53 | { 54 | putOutput(); 55 | } 56 | } 57 | 58 | // synchronizes the stream buffer and returns 0 on success 59 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 60 | // resetting the buffer and flushing the stream 61 | virtual int sync() 62 | { 63 | putOutput(); 64 | return 0; 65 | } 66 | 67 | void putOutput() 68 | { 69 | if (mShouldLog) 70 | { 71 | // prepend timestamp 72 | std::time_t timestamp = std::time(nullptr); 73 | tm* tm_local = std::localtime(×tamp); 74 | std::cout << "["; 75 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 76 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 77 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 78 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 81 | // std::stringbuf::str() gets the string contents of the buffer 82 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 83 | mOutput << mPrefix << str(); 84 | // set the buffer to empty 85 | str(""); 86 | // flush the stream 87 | mOutput.flush(); 88 | } 89 | } 90 | 91 | void setShouldLog(bool shouldLog) 92 | { 93 | mShouldLog = shouldLog; 94 | } 95 | 96 | private: 97 | std::ostream& mOutput; 98 | std::string mPrefix; 99 | bool mShouldLog; 100 | }; 101 | 102 | //! 103 | //! \class LogStreamConsumerBase 104 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 105 | //! 106 | class LogStreamConsumerBase 107 | { 108 | public: 109 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 110 | : mBuffer(stream, prefix, shouldLog) 111 | { 112 | } 113 | 114 | protected: 115 | LogStreamConsumerBuffer mBuffer; 116 | }; 117 | 118 | //! 119 | //! \class LogStreamConsumer 120 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 121 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 122 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 123 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 124 | //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
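//!
//! A minimal usage sketch (added note; `gLogger` and the createInferRuntime call are illustrative
//! assumptions for a TensorRT 8.x setup, not part of this header):
//!
//!     static Logger gLogger(Severity::kINFO);
//!     nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger.getTRTLogger());
//!     gLogger.log(Severity::kINFO, "runtime created");  // kVERBOSE messages are filtered at this level
//!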
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) noexcept override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! 
\return a TestAtom that can be used in Logger::reportTest{Start,End}(). 306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
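//! (Added note: the "&&&& RUNNING / PASSED / FAILED / WAIVED" lines emitted below follow the
//! machine-readable convention used by NVIDIA's sample test harnesses.)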
413 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 414 | { 415 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 416 | << testAtom.mCmdline << std::endl; 417 | } 418 | 419 | //! 420 | //! \brief generate a command line string from the given (argc, argv) values 421 | //! 422 | static std::string genCmdlineString(int argc, char const* const* argv) 423 | { 424 | std::stringstream ss; 425 | for (int i = 0; i < argc; i++) 426 | { 427 | if (i > 0) 428 | ss << " "; 429 | ss << argv[i]; 430 | } 431 | return ss.str(); 432 | } 433 | 434 | Severity mReportableSeverity; 435 | }; 436 | 437 | namespace 438 | { 439 | 440 | //! 441 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 442 | //! 443 | //! Example usage: 444 | //! 445 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 446 | //! 447 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 448 | { 449 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 450 | } 451 | 452 | //! 453 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 454 | //! 455 | //! Example usage: 456 | //! 457 | //! LOG_INFO(logger) << "hello world" << std::endl; 458 | //! 459 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 460 | { 461 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 462 | } 463 | 464 | //! 465 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 466 | //! 467 | //! Example usage: 468 | //! 469 | //! LOG_WARN(logger) << "hello world" << std::endl; 470 | //! 471 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 472 | { 473 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 474 | } 475 | 476 | //! 477 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 478 | //! 479 | //! Example usage: 480 | //! 481 | //! LOG_ERROR(logger) << "hello world" << std::endl; 482 | //! 483 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 484 | { 485 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 486 | } 487 | 488 | //! 489 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 490 | // ("fatal" severity) 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_FATAL(logger) << "hello world" << std::endl; 495 | //! 496 | inline LogStreamConsumer LOG_FATAL(const Logger& logger) 497 | { 498 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 499 | } 500 | 501 | } // anonymous namespace 502 | 503 | #endif // TENSORRT_LOGGING_H 504 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "mixformer_onnx.h" 6 | 7 | void track(Mixformer *tracker, const char *video_path) 8 | { 9 | // Read video. 10 | cv::VideoCapture capture; 11 | bool ret; 12 | if (strlen(video_path)==1) 13 | ret = capture.open(atoi(video_path)); 14 | else 15 | ret = capture.open(video_path); 16 | 17 | // Exit if video not opened. 18 | if (!ret) 19 | std::cout << "Open cap failed!" << std::endl; 20 | 21 | // Read first frame. 
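// Added note: if the capture failed to open above, only a message is printed and execution
// falls through; a more defensive version would check capture.isOpened() and return early.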
22 | cv::Mat frame; 23 | 24 | bool ok = capture.read(frame); 25 | if (!ok) 26 | { 27 | std::cout<< "Cannot read video file" << std::endl; 28 | return; 29 | } 30 | 31 | // Select a rect. 32 | cv::namedWindow("demo"); 33 | cv::Rect trackWindow = cv::selectROI("demo", frame); 34 | // cv::Rect trackWindow(744, 417, 42, 95); 35 | 36 | 37 | // Initialize tracker with first frame and rect. 38 | std::cout << "Start track init ..." << std::endl; 39 | std::cout << "==========================" << std::endl; 40 | DrOBB bbox; 41 | bbox.box.x0 = trackWindow.x; 42 | bbox.box.x1 = trackWindow.x+trackWindow.width; 43 | bbox.box.y0 = trackWindow.y; 44 | bbox.box.y1 = trackWindow.y+trackWindow.height; 45 | tracker->init(frame, bbox); 46 | std::cout << "==========================" << std::endl; 47 | std::cout << "Init done!" << std::endl; 48 | std::cout << std::endl; 49 | 50 | int frame_id = 0; 51 | double avg_fps = 0.f; 52 | for (;;) 53 | { 54 | // Read a new frame. 55 | capture >> frame; 56 | if (frame.empty()) 57 | break; 58 | frame_id += 1; 59 | // Start timer 60 | double t = (double)cv::getTickCount(); 61 | 62 | // Update tracker. 63 | DrOBB bbox = tracker->track(frame); 64 | 65 | // Calculate Frames per second (FPS) 66 | double fps = cv::getTickFrequency() / ((double)cv::getTickCount() - t); 67 | avg_fps += fps; 68 | 69 | // Result to rect. 70 | cv::Rect rect; 71 | rect.x = bbox.box.x0; 72 | rect.y = bbox.box.y0; 73 | rect.width = int(bbox.box.x1 - bbox.box.x0); 74 | rect.height = int(bbox.box.y1 - bbox.box.y0); 75 | 76 | std::cout << "[x0, y0, w, h]: [" << rect.x << " " << rect.y << " " << rect.width << " " << rect.height << "]" << std::endl; 77 | std::cout << "score: " << bbox.score << std::endl; 78 | 79 | // Boundary judgment. 80 | cv::Mat track_window; 81 | if (0 <= rect.x && 0 <= rect.width && rect.x + rect.width <= frame.cols && 0 <= rect.y && 0 <= rect.height && rect.y + rect.height <= frame.rows) 82 | { 83 | cv::rectangle(frame, rect, cv::Scalar(0, 255, 0)); 84 | } 85 | 86 | // Display FPS 87 | std::cout << "FPS: " << fps << std::endl; 88 | std::cout << "==========================" << std::endl; 89 | std::cout << std::endl; 90 | 91 | 92 | // Display result. 93 | cv::imshow("demo", frame); 94 | cv::waitKey(1); 95 | 96 | // Exit if 'q' pressed. 97 | if (cv::waitKey(1) == 'q') 98 | { 99 | break; 100 | } 101 | } 102 | std::cout << "AVG_FPS: " << avg_fps / frame_id << std::endl; 103 | cv::destroyWindow("demo"); 104 | capture.release(); 105 | } 106 | 107 | 108 | int main(int argc, char** argv) 109 | { 110 | if (argc != 3) 111 | { 112 | fprintf(stderr, "Usage: %s [modelpath] [videopath(file or camera)]\n", argv[0]); 113 | return -1; 114 | } 115 | 116 | // Get model path. 117 | const char* model_path = argv[1]; // Mixformer-onnx/model/mixformer_v2.onnx"; 118 | 119 | // Get video path. 120 | const char* video_path = argv[2]; 121 | 122 | // Build tracker. 123 | Mixformer *Mixformerer; 124 | Mixformerer = new Mixformer(model_path, 8); 125 | track(Mixformerer, video_path); 126 | 127 | return 0; 128 | } -------------------------------------------------------------------------------- /main_trt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "mixformer_trt.h" 6 | 7 | void track(MixformerTRT *tracker, std::string video_path) 8 | { 9 | // Read video. 
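// Added note: the camera-index branch is commented out below because video_path is now a
// std::string. A possible way to restore it (a sketch, not the author's code):
//   if (video_path.size() == 1 && isdigit(video_path[0]))
//       ret = capture.open(std::stoi(video_path));
//   else
//       ret = capture.open(video_path);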
10 | cv::VideoCapture capture; 11 | bool ret; 12 | // if (strlen(video_path)==1) 13 | // ret = capture.open(atoi(video_path)); 14 | // else 15 | // ret = capture.open(video_path); 16 | ret = capture.open(video_path); 17 | // Exit if video not opened. 18 | if (!ret) 19 | std::cout << "Open cap failed!" << std::endl; 20 | 21 | // Read first frame. 22 | cv::Mat frame; 23 | 24 | bool ok = capture.read(frame); 25 | if (!ok) 26 | { 27 | std::cout<< "Cannot read video file" << std::endl; 28 | return; 29 | } 30 | 31 | // Select a rect. 32 | cv::namedWindow("demo"); 33 | cv::Rect trackWindow = cv::selectROI("demo", frame); 34 | // cv::Rect trackWindow(744, 417, 42, 95); 35 | 36 | 37 | // Initialize tracker with first frame and rect. 38 | std::cout << "Start track init ..." << std::endl; 39 | std::cout << "==========================" << std::endl; 40 | DrOBB bbox; 41 | bbox.box.x0 = trackWindow.x; 42 | bbox.box.x1 = trackWindow.x+trackWindow.width; 43 | bbox.box.y0 = trackWindow.y; 44 | bbox.box.y1 = trackWindow.y+trackWindow.height; 45 | tracker->init(frame, bbox); 46 | std::cout << "==========================" << std::endl; 47 | std::cout << "Init done!" << std::endl; 48 | std::cout << std::endl; 49 | 50 | int frame_id = 0; 51 | double avg_fps = 0.f; 52 | for (;;) 53 | { 54 | // Read a new frame. 55 | capture >> frame; 56 | if (frame.empty()) 57 | break; 58 | frame_id += 1; 59 | // Start timer 60 | double t = (double)cv::getTickCount(); 61 | 62 | // Update tracker. 63 | DrOBB bbox = tracker->track(frame); 64 | 65 | // Calculate Frames per second (FPS) 66 | double fps = cv::getTickFrequency() / ((double)cv::getTickCount() - t); 67 | avg_fps += fps; 68 | 69 | // Result to rect. 70 | cv::Rect rect; 71 | rect.x = bbox.box.x0; 72 | rect.y = bbox.box.y0; 73 | rect.width = int(bbox.box.x1 - bbox.box.x0); 74 | rect.height = int(bbox.box.y1 - bbox.box.y0); 75 | 76 | std::cout << "[x0, y0, w, h]: [" << rect.x << " " << rect.y << " " << rect.width << " " << rect.height << "]" << std::endl; 77 | std::cout << "score: " << bbox.score << std::endl; 78 | 79 | // Boundary judgment. 80 | cv::Mat track_window; 81 | if (0 <= rect.x && 0 <= rect.width && rect.x + rect.width <= frame.cols && 0 <= rect.y && 0 <= rect.height && rect.y + rect.height <= frame.rows) 82 | { 83 | cv::rectangle(frame, rect, cv::Scalar(0, 255, 0)); 84 | } 85 | 86 | // Display FPS 87 | std::cout << "FPS: " << fps << std::endl; 88 | std::cout << "==========================" << std::endl; 89 | std::cout << std::endl; 90 | 91 | 92 | // Display result. 93 | cv::imshow("demo", frame); 94 | cv::waitKey(1); 95 | 96 | // Exit if 'q' pressed. 97 | if (cv::waitKey(1) == 'q') 98 | { 99 | break; 100 | } 101 | } 102 | std::cout << "AVG_FPS: " << avg_fps / frame_id << std::endl; 103 | cv::destroyWindow("demo"); 104 | capture.release(); 105 | } 106 | 107 | 108 | int main(int argc, char** argv) 109 | { 110 | if (argc != 3) 111 | { 112 | fprintf(stderr, "Usage: %s [modelpath] [videopath(file or camera)]\n", argv[0]); 113 | return -1; 114 | } 115 | 116 | // Get model path. 117 | std::string model_path = argv[1]; // Mixformer-onnx/model/mixformer_v2.onnx"; 118 | 119 | // Get video path. 120 | std::string video_path = argv[2]; 121 | 122 | // Build tracker. 
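// Added note: the raw pointer below is never deleted before main() returns. A leak-free
// alternative (a sketch, assuming <memory> is included) would be:
//   auto tracker = std::make_unique<MixformerTRT>(model_path);
//   track(tracker.get(), video_path);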
123 | MixformerTRT *Mixformerer; 124 | Mixformerer = new MixformerTRT(model_path); 125 | track(Mixformerer, video_path); 126 | 127 | return 0; 128 | } -------------------------------------------------------------------------------- /mixformer-pyonnx/mf_tracker_ort.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import numpy as np 5 | import time 6 | import cv2 7 | import glob 8 | import onnxruntime 9 | import torch 10 | import torch.nn.functional as F 11 | import math 12 | 13 | prj_path = os.path.join(os.path.dirname(__file__), '..') 14 | if prj_path not in sys.path: 15 | sys.path.append(prj_path) 16 | 17 | def get_frames(video_name): 18 | """获取视频帧 19 | 20 | Args: 21 | video_name (_type_): _description_ 22 | 23 | Yields: 24 | _type_: _description_ 25 | """ 26 | if not video_name: 27 | rtsp = "rtsp://%s:%s@%s:554/cam/realmonitor?channel=1&subtype=1" % ("admin", "123456", "192.168.1.108") 28 | cap = cv2.VideoCapture(rtsp) if rtsp else cv2.VideoCapture() 29 | 30 | # warmup 31 | for i in range(5): 32 | cap.read() 33 | while True: 34 | ret, frame = cap.read() 35 | if ret: 36 | # print('读取成功===>>>', frame.shape) 37 | yield cv2.resize(frame,(800, 600)) 38 | else: 39 | break 40 | elif video_name.endswith('avi') or \ 41 | video_name.endswith('mp4'): 42 | cap = cv2.VideoCapture(video_name) 43 | while True: 44 | ret, frame = cap.read() 45 | if ret: 46 | yield frame 47 | else: 48 | break 49 | else: 50 | images = sorted(glob(os.path.join(video_name, 'img', '*.jp*'))) 51 | for img in images: 52 | frame = cv2.imread(img) 53 | yield frame 54 | 55 | class Preprocessor_wo_mask(object): 56 | def __init__(self): 57 | self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).cuda() 58 | self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).cuda() 59 | 60 | def process(self, img_arr: np.ndarray): 61 | # Deal with the image patch 62 | img_tensor = torch.tensor(img_arr).cuda().float().permute((2,0,1)).unsqueeze(dim=0) 63 | img_tensor_norm = ((img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) 64 | return img_tensor_norm.contiguous() 65 | 66 | class MFTrackerORT: 67 | def __init__(self, model_path, fp16=False) -> None: 68 | self.debug = True 69 | self.gpu_id = 0 70 | self.providers = ["CUDAExecutionProvider"] 71 | self.provider_options = [{"device_id": str(self.gpu_id)}] 72 | self.model_path = model_path 73 | self.fp16 = fp16 74 | 75 | self.init_track_net() 76 | self.preprocessor = Preprocessor_wo_mask() 77 | self.max_score_decay = 1.0 78 | self.search_factor = 4.5 79 | self.search_size = 224 80 | self.template_factor = 2.0 81 | self.template_size = 112 82 | self.update_interval = 200 83 | self.online_size = 1 84 | 85 | def init_track_net(self): 86 | """使用设置的参数初始化tracker网络 87 | """ 88 | self.ort_session = onnxruntime.InferenceSession(self.model_path, providers=self.providers, provider_options=self.provider_options) 89 | 90 | def track_init(self, frame, target_pos=None, target_sz = None): 91 | """使用第一帧进行初始化 92 | 93 | Args: 94 | frame (_type_): _description_ 95 | target_pos (_type_, optional): _description_. Defaults to None. 96 | target_sz (_type_, optional): _description_. Defaults to None. 
97 | """ 98 | self.trace_list = [] 99 | try: 100 | # [x, y, w, h] 101 | init_state = [target_pos[0], target_pos[1], target_sz[0], target_sz[1]] 102 | z_patch_arr, _, z_amask_arr = self.sample_target(frame, init_state, self.template_factor, output_sz=self.template_size) 103 | template = self.preprocessor.process(z_patch_arr) 104 | self.template = template 105 | self.online_template = template 106 | 107 | self.online_state = init_state 108 | self.online_image = frame 109 | self.max_pred_score = -1.0 110 | self.online_max_template = template 111 | self.online_forget_id = 0 112 | 113 | # save states 114 | self.state = init_state 115 | self.frame_id = 0 116 | print(f"第一帧初始化完毕!") 117 | except: 118 | print(f"第一帧初始化异常!") 119 | exit() 120 | 121 | def track(self, image, info: dict = None): 122 | H, W, _ = image.shape 123 | self.frame_id += 1 124 | x_patch_arr, resize_factor, x_amask_arr = self.sample_target(image, self.state, self.search_factor, 125 | output_sz=self.search_size) # (x1, y1, w, h) 126 | search = self.preprocessor.process(x_patch_arr) 127 | 128 | # compute ONNX Runtime output prediction 129 | ort_inputs = {'img_t': self.to_numpy(self.template), 'img_ot': self.to_numpy(self.online_template), 'img_search': self.to_numpy(search)} 130 | 131 | ort_outs = self.ort_session.run(None, ort_inputs) 132 | 133 | # print(f">>> lenght trt_outputs: {ort_outs}") 134 | pred_boxes = torch.from_numpy(ort_outs[0]) 135 | pred_score = torch.from_numpy(ort_outs[1]) 136 | # print(f">>> box and score: {pred_boxes} {pred_score}") 137 | # Baseline: Take the mean of all pred boxes as the final result 138 | pred_box = (pred_boxes.mean(dim=0) * self.search_size / resize_factor).tolist() # (cx, cy, w, h) [0,1] 139 | # get the final box result 140 | self.state = self.clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10) 141 | 142 | self.max_pred_score = self.max_pred_score * self.max_score_decay 143 | # update template 144 | if pred_score > 0.5 and pred_score > self.max_pred_score: 145 | z_patch_arr, _, z_amask_arr = self.sample_target(image, self.state, 146 | self.template_factor, 147 | output_sz=self.template_size) # (x1, y1, w, h) 148 | self.online_max_template = self.preprocessor.process(z_patch_arr) 149 | self.max_pred_score = pred_score 150 | 151 | 152 | if self.frame_id % self.update_interval == 0: 153 | if self.online_size == 1: 154 | self.online_template = self.online_max_template 155 | else: 156 | self.online_template[self.online_forget_id:self.online_forget_id+1] = self.online_max_template 157 | self.online_forget_id = (self.online_forget_id + 1) % self.online_size 158 | 159 | self.max_pred_score = -1 160 | self.online_max_template = self.template 161 | 162 | # for debug 163 | if self.debug: 164 | x1, y1, w, h = self.state 165 | # image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 166 | cv2.rectangle(image, (int(x1),int(y1)), (int(x1+w),int(y1+h)), color=(0,0,255), thickness=2) 167 | 168 | return {"target_bbox": self.state, "conf_score": pred_score} 169 | 170 | def map_box_back(self, pred_box: list, resize_factor: float): 171 | cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3] 172 | cx, cy, w, h = pred_box 173 | half_side = 0.5 * self.search_size / resize_factor 174 | cx_real = cx + (cx_prev - half_side) 175 | cy_real = cy + (cy_prev - half_side) 176 | return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h] 177 | 178 | def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float): 179 | cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], 
self.state[1] + 0.5 * self.state[3] 180 | cx, cy, w, h = pred_box.unbind(-1) # (N,4) --> (N,) 181 | half_side = 0.5 * self.search_size / resize_factor 182 | cx_real = cx + (cx_prev - half_side) 183 | cy_real = cy + (cy_prev - half_side) 184 | return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1) 185 | 186 | def to_numpy(self, tensor): 187 | if self.fp16: 188 | return tensor.detach().cpu().half().numpy() if tensor.requires_grad else tensor.cpu().half().numpy() 189 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 190 | 191 | def sample_target(self, im, target_bb, search_area_factor, output_sz=None, mask=None): 192 | """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area 193 | 194 | args: 195 | im - cv image 196 | target_bb - target box [x, y, w, h] 197 | search_area_factor - Ratio of crop size to target size 198 | output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. 199 | 200 | returns: 201 | cv image - extracted crop 202 | float - the factor by which the crop has been resized to make the crop size equal output_size 203 | """ 204 | if not isinstance(target_bb, list): 205 | x, y, w, h = target_bb.tolist() 206 | else: 207 | x, y, w, h = target_bb 208 | # Crop image 209 | crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) 210 | 211 | if crop_sz < 1: 212 | raise Exception('Too small bounding box.') 213 | 214 | x1 = int(round(x + 0.5 * w - crop_sz * 0.5)) 215 | x2 = int(x1 + crop_sz) 216 | 217 | y1 = int(round(y + 0.5 * h - crop_sz * 0.5)) 218 | y2 = int(y1 + crop_sz) 219 | 220 | x1_pad = int(max(0, -x1)) 221 | x2_pad = int(max(x2 - im.shape[1] + 1, 0)) 222 | 223 | y1_pad = int(max(0, -y1)) 224 | y2_pad = int(max(y2 - im.shape[0] + 1, 0)) 225 | 226 | # Crop target 227 | im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] 228 | if mask is not None: 229 | mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad] 230 | 231 | # Pad 232 | im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv2.BORDER_CONSTANT) 233 | # deal with attention mask 234 | H, W, _ = im_crop_padded.shape 235 | att_mask = np.ones((H,W)) 236 | end_x, end_y = -x2_pad, -y2_pad 237 | if y2_pad == 0: 238 | end_y = None 239 | if x2_pad == 0: 240 | end_x = None 241 | att_mask[y1_pad:end_y, x1_pad:end_x] = 0 242 | if mask is not None: 243 | mask_crop_padded = F.pad(mask_crop, pad=(x1_pad, x2_pad, y1_pad, y2_pad), mode='constant', value=0) 244 | 245 | 246 | if output_sz is not None: 247 | resize_factor = output_sz / crop_sz 248 | im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz)) 249 | att_mask = cv2.resize(att_mask, (output_sz, output_sz)).astype(np.bool_) 250 | if mask is None: 251 | return im_crop_padded, resize_factor, att_mask 252 | mask_crop_padded = \ 253 | F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), mode='bilinear', align_corners=False)[0, 0] 254 | return im_crop_padded, resize_factor, att_mask, mask_crop_padded 255 | 256 | else: 257 | if mask is None: 258 | return im_crop_padded, att_mask.astype(np.bool_), 1.0 259 | return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded 260 | 261 | def clip_box(self, box: list, H, W, margin=0): 262 | x1, y1, w, h = box 263 | x2, y2 = x1 + w, y1 + h 264 | x1 = min(max(0, x1), W-margin) 265 | x2 = min(max(margin, x2), W) 266 | y1 = min(max(0, y1), H-margin) 267 | y2 = min(max(margin, y2), H) 268 | w = max(margin, x2-x1) 
269 | h = max(margin, y2-y1) 270 | return [x1, y1, w, h] 271 | 272 | if __name__ == '__main__': 273 | print("测试") 274 | model_path = "model/mixformer_v2_sim.onnx" 275 | Tracker = MFTrackerORT(model_path = model_path, fp16=False) 276 | first_frame = True 277 | Tracker.video_name = "/home/nhy/lsm/dataset/target.mp4" 278 | 279 | if Tracker.video_name: 280 | video_name = Tracker.video_name 281 | else: 282 | video_name = 'webcam' 283 | cv2.namedWindow(video_name, cv2.WND_PROP_FULLSCREEN) 284 | 285 | frame_id = 0 286 | total_time = 0 287 | for frame in get_frames(Tracker.video_name): 288 | # print(f"frame shape {frame.shape}") 289 | tic = cv2.getTickCount() 290 | if first_frame: 291 | x, y, w, h = cv2.selectROI(video_name, frame, fromCenter=False) 292 | 293 | target_pos = [x, y] 294 | target_sz = [w, h] 295 | print('====================type=================', target_pos, type(target_pos), type(target_sz)) 296 | Tracker.track_init(frame, target_pos, target_sz) 297 | first_frame = False 298 | else: 299 | state = Tracker.track(frame) 300 | frame_id += 1 301 | 302 | cv2.imshow('Tracking', frame) 303 | cv2.waitKey(1) 304 | 305 | toc = cv2.getTickCount() - tic 306 | toc = int(1 / (toc / cv2.getTickFrequency())) 307 | total_time += toc 308 | print('Video: {:12s} {:3.1f}fps'.format('tracking', toc)) 309 | 310 | print('video: average {:12s} {:3.1f} fps'.format('finale average tracking fps', total_time/(frame_id - 1))) 311 | cv2.destroyAllWindows() 312 | -------------------------------------------------------------------------------- /mixformer-pytrt/mf_tracker_trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import numpy as np 5 | import time 6 | import cv2 7 | import glob 8 | import onnxruntime 9 | import torch 10 | import torch.nn.functional as F 11 | import math 12 | 13 | # prj_path = os.path.join(os.path.dirname(__file__), '..') 14 | # if prj_path not in sys.path: 15 | # sys.path.append(prj_path) 16 | 17 | from mixformer_nvinfer import MixformerNvinfer 18 | 19 | prj_path = os.path.join(os.path.dirname(__file__), '..') 20 | if prj_path not in sys.path: 21 | sys.path.append(prj_path) 22 | 23 | def get_frames(video_name): 24 | """获取视频帧 25 | 26 | Args: 27 | video_name (_type_): _description_ 28 | 29 | Yields: 30 | _type_: _description_ 31 | """ 32 | if not video_name: 33 | rtsp = "rtsp://%s:%s@%s:554/cam/realmonitor?channel=1&subtype=1" % ("admin", "123456", "192.168.1.108") 34 | cap = cv2.VideoCapture(rtsp) if rtsp else cv2.VideoCapture() 35 | 36 | # warmup 37 | for i in range(5): 38 | cap.read() 39 | while True: 40 | ret, frame = cap.read() 41 | if ret: 42 | # print('读取成功===>>>', frame.shape) 43 | yield cv2.resize(frame,(800, 600)) 44 | else: 45 | break 46 | elif video_name.endswith('avi') or \ 47 | video_name.endswith('mp4'): 48 | cap = cv2.VideoCapture(video_name) 49 | while True: 50 | ret, frame = cap.read() 51 | if ret: 52 | yield frame 53 | else: 54 | break 55 | else: 56 | images = sorted(glob(os.path.join(video_name, 'img', '*.jp*'))) 57 | for img in images: 58 | frame = cv2.imread(img) 59 | yield frame 60 | 61 | class Preprocessor_wo_mask(object): 62 | def __init__(self): 63 | self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).cuda() 64 | self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).cuda() 65 | 66 | def process(self, img_arr: np.ndarray): 67 | """初始化预处理图像. 
68 | 需要注意的是: 如果按照如下方式处理则无法通过。原因是transpose之后的数据不连续,需要将其变为连续的数 69 | 据,所以需要先进行ascontiguousarray 70 | ''' 71 | img_tensor = torch.tensor(img_arr).cuda().float().permute((2,0,1)).unsqueeze(dim=0) 72 | img_tensor_norm = ((img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) 73 | return img_tensor_norm 74 | ''' 75 | 因此,等效的实现包含两种,一种是直接在返回的tensor上添加contiguous,另一种是在numpy.array数据的 76 | 时候就进行ascontiguousarray. 77 | Args: 78 | img_arr (np.ndarray): _description_ 79 | 80 | Returns: 81 | _type_: _description_ 82 | """ 83 | # 第一种方法 84 | # im = np.ascontiguousarray(img_arr.transpose((2, 0, 1))[::-1]) # HWC to CHW, BGR to RGB 85 | # im = torch.from_numpy(im).cuda().float() 86 | # im = ((im / 255.0) - self.mean) / self.std 87 | # if len(im.shape) == 3: 88 | # im = im[None] 89 | 90 | # print(f"preprocessor im: {im.shape}") 91 | # return im 92 | 93 | # 第二种方法 94 | # img_arr = cv2.cvtColor(img_arr, cv2.COLOR_RGB2BGR) 95 | img_tensor = torch.tensor(img_arr).cuda().float().permute((2,0,1)).unsqueeze(dim=0) 96 | img_tensor_norm = ((img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) 97 | return img_tensor_norm.contiguous() 98 | 99 | class MFTrackerTRT: 100 | def __init__(self) -> None: 101 | self.debug = True 102 | 103 | self.init_track_net() 104 | self.preprocessor = Preprocessor_wo_mask() 105 | self.max_score_decay = 1.0 106 | self.search_factor = 4.5 107 | self.search_size = 224 108 | self.template_factor = 2.0 109 | self.template_size = 112 110 | self.update_interval = 200 111 | self.online_size = 1 112 | 113 | def init_track_net(self): 114 | """使用设置的参数初始化tracker网络 115 | """ 116 | self.mixformer_tracker = MixformerNvinfer() 117 | 118 | def track_init(self, frame, target_pos=None, target_sz = None): 119 | """使用第一帧进行初始化 120 | 121 | Args: 122 | frame (_type_): _description_ 123 | target_pos (_type_, optional): _description_. Defaults to None. 124 | target_sz (_type_, optional): _description_. Defaults to None. 
125 | """ 126 | self.trace_list = [] 127 | try: 128 | # [x, y, w, h] 129 | init_state = [target_pos[0], target_pos[1], target_sz[0], target_sz[1]] 130 | z_patch_arr, _, z_amask_arr = self.sample_target(frame, init_state, self.template_factor, output_sz=self.template_size) 131 | template = self.preprocessor.process(z_patch_arr) 132 | self.template = template 133 | self.online_template = template 134 | 135 | self.online_state = init_state 136 | self.online_image = frame 137 | self.max_pred_score = -1.0 138 | self.online_max_template = template 139 | self.online_forget_id = 0 140 | 141 | # save states 142 | self.state = init_state 143 | self.frame_id = 0 144 | print(f"第一帧初始化完毕!") 145 | except: 146 | print(f"第一帧初始化异常!") 147 | exit() 148 | 149 | def track(self, image, info: dict = None): 150 | H, W, _ = image.shape 151 | self.frame_id += 1 152 | x_patch_arr, resize_factor, x_amask_arr = self.sample_target(image, self.state, self.search_factor, 153 | output_sz=self.search_size) # (x1, y1, w, h) 154 | search = self.preprocessor.process(x_patch_arr) 155 | # print(f">>>search: {search.shape}") 156 | # compute trt output prediction 157 | trt_outputs = self.mixformer_tracker.infer(self.template, self.online_template, search) 158 | # print(f">>> lenght trt_outputs: {trt_outputs}") 159 | pred_boxes = trt_outputs[0] 160 | pred_score = trt_outputs[1] 161 | 162 | print(f">>> trt_outputs: {pred_boxes, pred_score}") 163 | # Baseline: Take the mean of all pred boxes as the final result 164 | pred_box = (pred_boxes.mean(dim=0) * self.search_size / resize_factor).tolist() # (cx, cy, w, h) [0,1] 165 | # get the final box result 166 | self.state = self.clip_box(self.map_box_back(pred_box, resize_factor), H, W, margin=10) 167 | 168 | self.max_pred_score = self.max_pred_score * self.max_score_decay 169 | # update template 170 | if pred_score > 0.5 and pred_score > self.max_pred_score: 171 | z_patch_arr, _, z_amask_arr = self.sample_target(image, self.state, 172 | self.template_factor, 173 | output_sz=self.template_size) # (x1, y1, w, h) 174 | self.online_max_template = self.preprocessor.process(z_patch_arr) 175 | self.max_pred_score = pred_score 176 | 177 | 178 | if self.frame_id % self.update_interval == 0: 179 | if self.online_size == 1: 180 | self.online_template = self.online_max_template 181 | else: 182 | self.online_template[self.online_forget_id:self.online_forget_id+1] = self.online_max_template 183 | self.online_forget_id = (self.online_forget_id + 1) % self.online_size 184 | 185 | self.max_pred_score = -1 186 | self.online_max_template = self.template 187 | 188 | # for debug 189 | if self.debug: 190 | x1, y1, w, h = self.state 191 | # image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 192 | cv2.rectangle(image, (int(x1),int(y1)), (int(x1+w),int(y1+h)), color=(0,0,255), thickness=2) 193 | 194 | return {"target_bbox": self.state, "conf_score": pred_score} 195 | 196 | def map_box_back(self, pred_box: list, resize_factor: float): 197 | cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3] 198 | cx, cy, w, h = pred_box 199 | half_side = 0.5 * self.search_size / resize_factor 200 | cx_real = cx + (cx_prev - half_side) 201 | cy_real = cy + (cy_prev - half_side) 202 | return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h] 203 | 204 | def map_box_back_batch(self, pred_box: torch.Tensor, resize_factor: float): 205 | cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[1] + 0.5 * self.state[3] 206 | cx, cy, w, h = pred_box.unbind(-1) # (N,4) --> (N,) 207 | half_side 
= 0.5 * self.search_size / resize_factor 208 | cx_real = cx + (cx_prev - half_side) 209 | cy_real = cy + (cy_prev - half_side) 210 | return torch.stack([cx_real - 0.5 * w, cy_real - 0.5 * h, w, h], dim=-1) 211 | 212 | def to_numpy(self, tensor): 213 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 214 | 215 | def sample_target(self, im, target_bb, search_area_factor, output_sz=None, mask=None): 216 | """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area 217 | 218 | args: 219 | im - cv image 220 | target_bb - target box [x, y, w, h] 221 | search_area_factor - Ratio of crop size to target size 222 | output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. 223 | 224 | returns: 225 | cv image - extracted crop 226 | float - the factor by which the crop has been resized to make the crop size equal output_size 227 | """ 228 | if not isinstance(target_bb, list): 229 | x, y, w, h = target_bb.tolist() 230 | else: 231 | x, y, w, h = target_bb 232 | # Crop image 233 | crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) 234 | 235 | if crop_sz < 1: 236 | raise Exception('Too small bounding box.') 237 | 238 | x1 = int(round(x + 0.5 * w - crop_sz * 0.5)) 239 | x2 = int(x1 + crop_sz) 240 | 241 | y1 = int(round(y + 0.5 * h - crop_sz * 0.5)) 242 | y2 = int(y1 + crop_sz) 243 | 244 | x1_pad = int(max(0, -x1)) 245 | x2_pad = int(max(x2 - im.shape[1] + 1, 0)) 246 | 247 | y1_pad = int(max(0, -y1)) 248 | y2_pad = int(max(y2 - im.shape[0] + 1, 0)) 249 | 250 | # Crop target 251 | im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] 252 | if mask is not None: 253 | mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad] 254 | 255 | # Pad 256 | im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv2.BORDER_CONSTANT) 257 | # deal with attention mask 258 | H, W, _ = im_crop_padded.shape 259 | att_mask = np.ones((H,W)) 260 | end_x, end_y = -x2_pad, -y2_pad 261 | if y2_pad == 0: 262 | end_y = None 263 | if x2_pad == 0: 264 | end_x = None 265 | att_mask[y1_pad:end_y, x1_pad:end_x] = 0 266 | if mask is not None: 267 | mask_crop_padded = F.pad(mask_crop, pad=(x1_pad, x2_pad, y1_pad, y2_pad), mode='constant', value=0) 268 | 269 | 270 | if output_sz is not None: 271 | resize_factor = output_sz / crop_sz 272 | im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz)) 273 | att_mask = cv2.resize(att_mask, (output_sz, output_sz)).astype(np.bool_) 274 | if mask is None: 275 | return im_crop_padded, resize_factor, att_mask 276 | mask_crop_padded = \ 277 | F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), mode='bilinear', align_corners=False)[0, 0] 278 | return im_crop_padded, resize_factor, att_mask, mask_crop_padded 279 | 280 | else: 281 | if mask is None: 282 | return im_crop_padded, att_mask.astype(np.bool_), 1.0 283 | return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded 284 | 285 | def clip_box(self, box: list, H, W, margin=0): 286 | x1, y1, w, h = box 287 | x2, y2 = x1 + w, y1 + h 288 | x1 = min(max(0, x1), W-margin) 289 | x2 = min(max(margin, x2), W) 290 | y1 = min(max(0, y1), H-margin) 291 | y2 = min(max(margin, y2), H) 292 | w = max(margin, x2-x1) 293 | h = max(margin, y2-y1) 294 | return [x1, y1, w, h] 295 | 296 | if __name__ == '__main__': 297 | print("测试") 298 | Tracker = MFTrackerTRT() 299 | Tracker.video_name = "/home/nhy/lsm/dataset/target.mp4" 300 | # init_state = [282, 250, 
23, 23] 301 | 302 | # warm_up = 500 303 | # warm_up_first = True 304 | # input0= torch.rand((112, 112, 3)).numpy() 305 | # input1= torch.rand((224, 224, 3)).numpy() 306 | # for i in range(warm_up): 307 | # if warm_up_first == True: 308 | # Tracker.track_init(input0, [20, 20], [50, 50]) 309 | # warm_up_first = False 310 | # else: 311 | # state = Tracker.track(input1) 312 | 313 | first_frame = True 314 | if Tracker.video_name: 315 | video_name = Tracker.video_name 316 | else: 317 | video_name = 'webcam' 318 | 319 | frame_id = 0 320 | total_time = 0 321 | for frame in get_frames(Tracker.video_name): 322 | # print(f"frame shape {frame.shape} {type(frame)}") 323 | tic = cv2.getTickCount() 324 | if first_frame: 325 | x, y, w, h = cv2.selectROI(video_name, frame, fromCenter=False) 326 | print(f">>>init state: {(x, y, w, h)}") 327 | target_pos = [x, y] 328 | target_sz = [w, h] 329 | # target_pos = [init_state[0], init_state[1]] 330 | # target_sz = [init_state[2], init_state[3]] 331 | Tracker.track_init(frame, target_pos, target_sz) 332 | first_frame = False 333 | else: 334 | state = Tracker.track(frame) 335 | frame_id += 1 336 | 337 | toc = cv2.getTickCount() - tic 338 | toc = int(1 / (toc / cv2.getTickFrequency())) 339 | total_time += toc 340 | print('Video: {:12s} {:3.1f}fps'.format('tracking', toc)) 341 | cv2.imshow('Tracking', frame) 342 | cv2.waitKey(1) 343 | 344 | print('video: average {:12s} {:3.1f} fps'.format('finale average tracking fps', total_time/(frame_id - 1))) 345 | cv2.destroyAllWindows() 346 | -------------------------------------------------------------------------------- /mixformer-pytrt/mixformer_construct_trt.py: -------------------------------------------------------------------------------- 1 | import tensorrt as trt 2 | from pathlib import Path 3 | import struct 4 | import numpy as np 5 | import pycuda 6 | import torch 7 | 8 | verbose = True 9 | IN_NAME1 = 'img_t' 10 | IN_NAME2 = 'img_ot' 11 | IN_NAME3 = 'img_search' 12 | OUT_NAME1 = 'pred_boxes' 13 | OUT_NAME2 = 'pred_scores' 14 | IN_H1 = 112 15 | IN_H2 = 112 16 | IN_H3 = 224 17 | IN_W1 = 112 18 | IN_W2 = 112 19 | IN_W3 = 224 20 | BATCH_SIZE = 1 21 | 22 | def read_wts(filename): 23 | # 读取权重文件 24 | weights = {} 25 | with open(filename, 'r') as f: 26 | # 读取权重数量 27 | num_weights = int(f.readline().strip()) 28 | print(f'>>> num_weights: {num_weights}') 29 | for i in range(num_weights): 30 | # if i == 0: 31 | line = f.readline().strip() 32 | # print(f'>>> line: {line}') 33 | parts = line.split(' ') 34 | 35 | # 获取权重名称、大小 36 | name = parts[0] 37 | size = parts[1] 38 | # print(f">>>name and size: {name} {size} {len(parts[2:])}") 39 | 40 | # 获取权重值 41 | values = [struct.unpack('!f', bytes.fromhex(x))[0] for x in parts[2:2+int(size)]] 42 | weights[name] = values 43 | print(f">>>wts name: {weights.keys()}") 44 | return weights 45 | 46 | 47 | def reshape(network, input_tensor, new_shape): 48 | shuffle_layer = network.add_shuffle(input_tensor) 49 | shuffle_layer.reshape_dims = new_shape 50 | 51 | return shuffle_layer 52 | 53 | 54 | def transpose(network, input_tensor, perm): 55 | # 创建一个shuffle层来实现transpose操作 56 | shuffle_layer = network.add_shuffle(input_tensor) 57 | 58 | # 假设你想将一个形状为(3, 32, 32)的tensor置换为(32, 32, 3) 59 | # 原始的顺序是[0, 1, 2],新的顺序是[1, 2, 0] 60 | shuffle_layer.first_transpose = trt.Permutation(perm) 61 | return shuffle_layer 62 | 63 | 64 | def layer_norm(network, weight, bias, input_tensor, exp): 65 | # 先进性标准化 66 | reduce_axes = 4 << 0 # 选择第一个维度 67 | reduce_layer = network.add_reduce(input_tensor, trt.ReduceOperation.AVG, 
reduce_axes, True) 68 | sub_tensor = network.add_elementwise(input_tensor, reduce_layer.get_output(0), trt.ElementWiseOperation.SUB) 69 | # 定义常数2.0 70 | constant_shape = input_tensor.shape 71 | constant_value = np.full(constant_shape, 2.0, dtype=np.float32) 72 | constant_weights = trt.Weights(constant_value.ravel()) 73 | constant_layer = network.add_constant(constant_shape, constant_weights) 74 | # pow(sub_tensor, 2.0) 75 | pow_tensor = network.add_elementwise(sub_tensor.get_output(0), constant_layer.get_output(0), trt.ElementWiseOperation.POW) 76 | # reduce mean 77 | reduce_layer = network.add_reduce(pow_tensor.get_output(0), trt.ReduceOperation.AVG, reduce_axes, True) 78 | # add 79 | constant_shape = reduce_layer.get_output(0).shape 80 | constant_value = np.full(constant_shape, 9.999999974752427e-7, dtype=np.float32) 81 | constant_weights = trt.Weights(constant_value.ravel()) 82 | constant_layer = network.add_constant(constant_shape, constant_weights) 83 | add_tensor = network.add_elementwise(reduce_layer.get_output(0), constant_layer.get_output(0), trt.ElementWiseOperation.SUM) 84 | # sqrt 85 | sqrt_layer = network.add_unary(add_tensor.get_output(0), trt.UnaryOperation.SQRT) 86 | # div 87 | div_tensor = network.add_elementwise(sub_tensor.get_output(0), sqrt_layer.get_output(0), trt.ElementWiseOperation.DIV) 88 | # mul(div_tensor, norm1.weight) 89 | weight = np.array(weight).astype(np.float32) 90 | weight_shape = weight.shape 91 | weight = trt.Weights(weight.ravel()) 92 | weight_constant = network.add_constant(weight_shape, weight) 93 | # 需要reshape成和div_tensor一样的形状 94 | weight_constant = reshape(network=network, input_tensor=weight_constant.get_output(0), new_shape=trt.Dims3(1, 1, weight_shape[0])) 95 | mul_tensor = network.add_elementwise(div_tensor.get_output(0), weight_constant.get_output(0), trt.ElementWiseOperation.PROD) 96 | # add 97 | bias = np.array(bias).astype(np.float32) 98 | bias_shape = bias.shape 99 | bias = trt.Weights(bias.ravel()) 100 | bias_constant = network.add_constant(bias_shape, bias) 101 | # # 需要reshape成和div_tensor一样的形状 102 | bias_constant = reshape(network=network, input_tensor=bias_constant.get_output(0), new_shape=trt.Dims3(1, 1, bias_shape[0])) 103 | add_tensor = network.add_elementwise(mul_tensor.get_output(0), bias_constant.get_output(0), trt.ElementWiseOperation.SUM) 104 | 105 | return add_tensor 106 | 107 | 108 | def matmul(network, input_tensor, weight): 109 | """实现矩阵乘的操作 110 | 111 | Args: 112 | network (_type_): _description_ 113 | input_tensor (_type_): _description_ 114 | weight (_type_): _description_ 115 | bias (_type_): _description_ 116 | """ 117 | mm_layer = network.add_matrix_multiply(input_tensor, trt.MatrixOperation.NONE, weight, trt.MatrixOperation.NONE) 118 | return mm_layer 119 | 120 | 121 | def mul(network, input_tensor, weight, weight_shape): 122 | """实现input_tensor与weight的乘法操作 123 | 124 | Args: 125 | network (_type_): _description_ 126 | input_tensor (_type_): _description_ 127 | weight (_type_): list 128 | """ 129 | # mul(input_tensor, weight) 130 | weight = np.array(weight).astype(np.float32) 131 | weight = trt.Weights(weight.ravel()) 132 | weight_constant = network.add_constant(weight_shape, weight) 133 | # input_tensor 134 | weight_constant = reshape(network=network, input_tensor=weight_constant.get_output(0), new_shape=weight_shape) 135 | # print(f">>>mul: {weight_constant.get_output(0).shape} {input_tensor.shape}") 136 | mul_layer = network.add_elementwise(input_tensor, weight_constant.get_output(0), trt.ElementWiseOperation.PROD) 
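    # Added note: add_elementwise requires both inputs to have the same number of dimensions,
    # which is why the scalar weight is materialised as a constant of shape `weight_shape`
    # (e.g. (1, 1, 1, 1)) and broadcast against `input_tensor`; the extra reshape of that
    # constant to the same shape is redundant but harmless.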
137 | return mul_layer 138 | 139 | 140 | def attention(network, qkv_weight, qkv_bias, proj_weight, proj_bias, input_tensor): 141 | """对layernorm后的数据进行attention 142 | 143 | Args: 144 | network (_type_): _description_ 145 | dim (_type_): _description_ 146 | num_head (_type_): _description_ 147 | attn_drop (_type_): _description_ 148 | proj_drop (_type_): _description_ 149 | qkv_weight (_type_): _description_ 150 | qkv_bias (_type_): _description_ 151 | proj_weight (_type_): _description_ 152 | proj_bias (_type_): _description_ 153 | """ 154 | # print(f">>> input_tensor shape: {input_tensor.shape}") 155 | B, N, C = input_tensor.shape 156 | reshape_tensor_1 = reshape(network=network, input_tensor=input_tensor, new_shape=trt.Dims2(N, C)) 157 | 158 | # linear 1 159 | # matmul操作 160 | qkv_weight = np.ascontiguousarray(np.array(qkv_weight).reshape(-1, 768).transpose(1, 0)).astype(np.float32) 161 | qkv_bias = np.array(qkv_bias).astype(np.float32) 162 | trt_weights_1 = trt.Weights(qkv_weight) # 注意转置,因为TRT期望的权重布局与PyTorch不同 163 | trt_biases_1 = trt.Weights(qkv_bias) 164 | weight_tensor_1 = network.add_constant(shape=(768, 2304), weights=trt_weights_1) 165 | bias_tensor_1 = network.add_constant(shape=(1, 2304), weights=trt_biases_1) 166 | mm_layer_1 = network.add_matrix_multiply(reshape_tensor_1.get_output(0), trt.MatrixOperation.NONE, weight_tensor_1.get_output(0), trt.MatrixOperation.NONE) 167 | # add 168 | add_layer_1 = network.add_elementwise(mm_layer_1.get_output(0), bias_tensor_1.get_output(0), trt.ElementWiseOperation.SUM) 169 | 170 | # reshape 171 | fc_layer_1_shape = trt.Dims([B, N, 3, 12, C//12]) 172 | fc_layer_1 = reshape(network=network, input_tensor=add_layer_1.get_output(0), new_shape=fc_layer_1_shape) 173 | transpose_1 = transpose(network=network, input_tensor=fc_layer_1.get_output(0), perm=[2, 0, 3, 1, 4]) 174 | 175 | # unbind(0) 176 | q, k, v = split(network=network, input_tensor=transpose_1.get_output(0), axis=0) 177 | # squeeze 178 | q = reshape(network=network, input_tensor=q, new_shape=trt.Dims(q.shape[1:])) 179 | k = reshape(network=network, input_tensor=k, new_shape=trt.Dims(k.shape[1:])) 180 | v = reshape(network=network, input_tensor=v, new_shape=trt.Dims(v.shape[1:])) 181 | 182 | # 3 * split 183 | q_98_split, q_200_split = split2(network=network, input_tensor=q.get_output(0)) 184 | k_98_split, k_200_split = split2(network=network, input_tensor=k.get_output(0)) 185 | v_98_split, v_200_split = split2(network=network, input_tensor=v.get_output(0)) 186 | 187 | k_98_split = transpose(network=network, input_tensor=k_98_split.get_output(0), perm=[0, 1, 3, 2]) 188 | q98_k98_mm_layer = matmul(network=network, input_tensor=q_98_split.get_output(0), weight=k_98_split.get_output(0)) 189 | qk98_mul_layer = mul(network=network, input_tensor=q98_k98_mm_layer.get_output(0), weight=0.125, weight_shape=trt.Dims([1, 1, 1, 1])) 190 | 191 | # softmax1 192 | qk98_softmax_layer = network.add_softmax(qk98_mul_layer.get_output(0)) 193 | 194 | # k_transpose 195 | k_transpose = transpose(network=network, input_tensor=k.get_output(0), perm=[0, 1, 3, 2]) 196 | q200ktranspose_mm_layer = matmul(network=network, input_tensor=q_200_split.get_output(0), weight=k_transpose.get_output(0)) 197 | q200ktranspose_mul_layer = mul(network=network, input_tensor=q200ktranspose_mm_layer.get_output(0), 198 | weight=0.125, weight_shape=trt.Dims([1, 1, 1, 1])) 199 | # softmax2 200 | q200ktranspose_softmax_layer = network.add_softmax(q200ktranspose_mul_layer.get_output(0)) 201 | 202 | # matmul(qk98softmax, v_98_split) 
transpose reshape 203 | qk98softmax_v98split_mm_layer = matmul(network=network, input_tensor=qk98_softmax_layer.get_output(0), 204 | weight=v_98_split.get_output(0)) 205 | qk98v98_transpose_layer = transpose(network=network, input_tensor=qk98softmax_v98split_mm_layer.get_output(0), perm=[0, 2, 1, 3]) 206 | dim = qk98v98_transpose_layer.get_output(0).shape 207 | qk98v98_reshape = reshape(network=network, input_tensor=qk98v98_transpose_layer.get_output(0), 208 | new_shape=trt.Dims([dim[0], dim[1], dim[2]*dim[3]])) 209 | 210 | # matmul(q200k98transpose_softmax, v) transpose reshape 211 | q200_ktranspose_v_mm_layer = matmul(network=network, input_tensor=q200ktranspose_softmax_layer.get_output(0), 212 | weight=v.get_output(0)) 213 | q200kv_transpose_layer = transpose(network=network, input_tensor=q200_ktranspose_v_mm_layer.get_output(0), perm=[0, 2, 1, 3]) 214 | dim = q200kv_transpose_layer.get_output(0).shape 215 | q200kv_reshape = reshape(network=network, input_tensor=q200kv_transpose_layer.get_output(0), 216 | new_shape=trt.Dims([dim[0], dim[1], dim[2]*dim[3]])) 217 | 218 | # concat the qk98v98_reshape and q200kv_reshape 219 | qk98v98_q200kv_concat_layer = network.add_concatenation([qk98v98_reshape.get_output(0), q200kv_reshape.get_output(0)]) 220 | qk98v98_q200kv_concat_layer.axis = 1 221 | 222 | # linear 2 223 | # matmul操作 224 | proj_weight = np.ascontiguousarray(np.array(proj_weight).reshape(-1, 768).transpose(1, 0)).astype(np.float32) 225 | proj_bias = np.array(proj_bias).astype(np.float32) 226 | trt_weights_2 = trt.Weights(proj_weight) # 注意转置,因为TRT期望的权重布局与PyTorch不同 227 | trt_biases_2 = trt.Weights(proj_bias) 228 | weight_tensor_2 = network.add_constant(shape=(1, 768, 768), weights=trt_weights_2) 229 | bias_tensor_2 = network.add_constant(shape=(1, 1, 768), weights=trt_biases_2) 230 | 231 | mm_layer_2 = network.add_matrix_multiply(qk98v98_q200kv_concat_layer.get_output(0), trt.MatrixOperation.NONE, 232 | weight_tensor_2.get_output(0), trt.MatrixOperation.NONE) 233 | # add 234 | add_layer_2 = network.add_elementwise(mm_layer_2.get_output(0), bias_tensor_2.get_output(0), trt.ElementWiseOperation.SUM) 235 | 236 | # print(f">>> fc shape: {mm_layer_2.get_output(0).shape} {add_layer_2.get_output(0).shape}") 237 | return add_layer_2 238 | 239 | 240 | def split(network, input_tensor, axis=0): 241 | """将张量沿着指定的维度拆分成若干个张量,并返回一个元组(tuple)包含这些张量 242 | 243 | Args: 244 | input_tensor (_type_): _description_ 245 | axis (int, optional): _description_. Defaults to 0. 246 | """ 247 | dim = input_tensor.shape 248 | depth = input_tensor.shape[axis] 249 | 250 | # 为每个depth切片创建一个slice layer 251 | slices = [] 252 | # start = () 253 | for i in range(depth): 254 | slice_layer = network.add_slice(input_tensor, start=(i, 0, 0, 0, 0), shape=(1, dim[1], dim[2], dim[3], dim[4]), stride=(1, 1, 1, 1, 1)) 255 | slices.append(slice_layer.get_output(0)) 256 | 257 | return slices 258 | 259 | 260 | def split2(network, input_tensor): 261 | """将张量两个张量分别为(1, 12, 98, 64)和(1, 12, 98, 200),并返回对应的层。 262 | 263 | Args: 264 | input_tensor (_type_): _description_ 265 | axis (int, optional): _description_. Defaults to 0. 
266 | """ 267 | 268 | # 第一部分:从(1, 12, 0, 64)到(1, 12, 98, 64) 269 | first_slice_layer = network.add_slice(input_tensor, start=(0, 0, 0, 0), shape=(1, 12, 98, 64), stride=(1, 1, 1, 1)) 270 | # 第二部分:从(1, 12, 98, 64)到(1, 12, 200, 64) 271 | second_slice_layer = network.add_slice(input_tensor, start=(0, 0, 98, 0), shape=(1, 12, 200, 64), stride=(1, 1, 1, 1)) 272 | # print(f">>>split_list2: {first_slice_layer.get_output(0).shape} {second_slice_layer.get_output(0).shape}") 273 | return first_slice_layer, second_slice_layer 274 | 275 | 276 | def split4(network, input_tensor, split_list, axis): 277 | """将张量分为四个部分,四个部分的shape分别为[a, b, c, d] 278 | 279 | Args: 280 | network (_type_): _description_ 281 | input_tensor (_type_): _description_ 282 | """ 283 | # print(f">>>split_list: {split_list}") 284 | B, N, C = input_tensor.shape 285 | x, y, z, j = split_list 286 | # 第一部分 从0到a 287 | if axis == 2: 288 | first_slice_layer = network.add_slice(input_tensor, start=(0, 0, 0), shape=(B, N, x), stride=(1, 1, 1)) 289 | # 第二部分 从a到b 290 | second_slice_layer = network.add_slice(input_tensor, start=(0, 0, x), shape=(B, N, y), stride=(1, 1, 1)) 291 | # 第三部分 从b到c 292 | third_slice_layer = network.add_slice(input_tensor, start=(0, 0, x+y), shape=(B, N, z), stride=(1, 1, 1)) 293 | # 第四部分 从c到d 294 | four_slice_layer = network.add_slice(input_tensor, start=(0, 0, x+y+z), shape=(B, N, j), stride=(1, 1, 1)) 295 | 296 | if axis == 1: 297 | first_slice_layer = network.add_slice(input_tensor, start=(0, 0, 0), shape=(B, x, C), stride=(1, 1, 1)) 298 | # 第二部分 从a到b 299 | second_slice_layer = network.add_slice(input_tensor, start=(0, x, 0), shape=(B, y, C), stride=(1, 1, 1)) 300 | # 第三部分 从b到c 301 | third_slice_layer = network.add_slice(input_tensor, start=(0, x+y, 0), shape=(B, z, C), stride=(1, 1, 1)) 302 | # 第四部分 从c到d 303 | four_slice_layer = network.add_slice(input_tensor, start=(0, x+y+z, 0), shape=(B, j, C), stride=(1, 1, 1)) 304 | 305 | return first_slice_layer, second_slice_layer, third_slice_layer, four_slice_layer 306 | 307 | 308 | def drop_path(network, input_tensor, drop_prob, training=False): 309 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks) 310 | 311 | Args: 312 | network (_type_): _description_ 313 | input_tensor (_type_): _description_ 314 | drop_prob (_type_): _description_ 315 | """ 316 | if drop_prob > 0. 
or training: 317 | return input_tensor 318 | 319 | # todo 可能的其他操作,未完成,该模型中不需要 320 | 321 | 322 | def mlp(network, input_tensor, weight, bias): 323 | """_summary_ 324 | 325 | Args: 326 | network (_type_): _description_ 327 | input_tensor (_type_): _description_ 328 | weight (_type_): _description_ 329 | bias (_type_): _description_ 330 | """ 331 | # linear 332 | # matmul操作 333 | # B, N, C = input_tensor.shape 334 | # print(f">>>BNC: {B} {N} {C}") 335 | weight = np.ascontiguousarray(np.array(weight).reshape(-1, 768).transpose(1, 0)).astype(np.float32) 336 | weight_dim = weight.shape 337 | bias = np.array(bias).astype(np.float32) 338 | bias_dim = bias.shape 339 | trt_weights = trt.Weights(weight) # 注意转置,因为TRT期望的权重布局与PyTorch不同 340 | trt_biases = trt.Weights(bias) 341 | weight_tensor = network.add_constant(shape=(1, weight_dim[0], weight_dim[1]), weights=trt_weights) 342 | bias_tensor = network.add_constant(shape=(1, 1, bias_dim[0]), weights=trt_biases) 343 | 344 | mm_layer = network.add_matrix_multiply(input_tensor, trt.MatrixOperation.NONE, weight_tensor.get_output(0), trt.MatrixOperation.NONE) 345 | # add 346 | add_layer = network.add_elementwise(mm_layer.get_output(0), bias_tensor.get_output(0), trt.ElementWiseOperation.SUM) 347 | 348 | return add_layer 349 | 350 | 351 | def gelu(network, input_tensor): 352 | """gelu激活函数 353 | 354 | Args: 355 | network (_type_): _description_ 356 | input_tensor (_type_): _description_ 357 | """ 358 | # 假设你已经有一个输入ITensor named input_tensor 359 | # input_tensor = ... 360 | 361 | # 定义GELU所需的常数 362 | const_half = network.add_constant((1, 1, 1), trt.Weights(np.array([0.5], dtype=np.float32))).get_output(0) 363 | const_sqrt_2_over_pi = network.add_constant((1, 1, 1), trt.Weights(np.array([np.sqrt(2.0 / np.pi)], dtype=np.float32))).get_output(0) 364 | # const_0_044715 = network.add_constant((1, 1, 1), trt.Weights(np.array([0.044715], dtype=np.float32))).get_output(0) 365 | 366 | # 计算0.044715 * x^3 367 | x3 = network.add_elementwise(input_tensor, input_tensor, trt.ElementWiseOperation.PROD).get_output(0) 368 | x3 = network.add_elementwise(input_tensor, x3, trt.ElementWiseOperation.PROD).get_output(0) 369 | scaled_x3 = network.add_scale(x3, mode=trt.ScaleMode.UNIFORM, shift=np.array([0.], dtype=np.float32), scale=np.array([0.044715], dtype=np.float32)).get_output(0) 370 | 371 | # 计算sqrt(2/pi) * (x + 0.044715 * x^3) 372 | x_plus_scaled_x3 = network.add_elementwise(input_tensor, scaled_x3, trt.ElementWiseOperation.SUM).get_output(0) 373 | scaled_result = network.add_elementwise(x_plus_scaled_x3, const_sqrt_2_over_pi, trt.ElementWiseOperation.PROD).get_output(0) 374 | 375 | # 计算tanh(...) 
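    # Reference formula for the tanh approximation targeted here:
    #   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    # i.e. the 0.5 factor belongs on the whole product x * (1 + tanh(...)).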
376 | tanh_result = network.add_activation(scaled_result, trt.ActivationType.TANH).get_output(0) 377 | 378 | # 计算0.5 * x * (1 + tanh(...)) 379 | one_plus_tanh = network.add_elementwise(tanh_result, const_half, trt.ElementWiseOperation.SUM).get_output(0) 380 | gelu_result_layer = network.add_elementwise(input_tensor, one_plus_tanh, trt.ElementWiseOperation.PROD) 381 | 382 | return gelu_result_layer 383 | 384 | 385 | def relu(network, input_tensor): 386 | """relu激活函数 387 | 388 | Args: 389 | network (_type_): _description_ 390 | input_tensor (_type_): _description_ 391 | """ 392 | relu_layer = network.add_activation(input_tensor, trt.ActivationType.RELU) 393 | return relu_layer 394 | 395 | 396 | def sigmoid(network, input_tensor): 397 | """sigmoid激活函数 398 | 399 | Args: 400 | network (_type_): _description_ 401 | input_tensor (_type_): _description_ 402 | """ 403 | sig_layer = network.add_activation(input_tensor, trt.ActivationType.SIGMOID) 404 | return sig_layer 405 | 406 | 407 | def softmax(network, input_tensor): 408 | softmax_layer = network.add_softmax(input_tensor) 409 | return softmax_layer 410 | 411 | 412 | def indice(feat_sz, stride): 413 | indice_array = (torch.arange(0, feat_sz).unsqueeze(0) * stride).numpy() 414 | return indice_array 415 | 416 | # def block_n(network, weights, block_num, input_x_tensor, H_t, W_t, H_s, W_s): 417 | def block_n(network, weights, block_num, input_x_tensor): 418 | """计算一个block 419 | 420 | Args: 421 | network (_type_): _description_ 422 | weights (_type_): _description_ 423 | block_num (_type_): _description_ 424 | input_x_tensor (_type_): _description_ 425 | 426 | Returns: 427 | _type_: _description_ 428 | """ 429 | # 参数设定 430 | norm1_weight = weights[f"blocks.{block_num}.norm1.weight"] 431 | norm1_bias = weights[f"blocks.{block_num}.norm1.bias"] 432 | 433 | attn_qkv_weight = weights[f"blocks.{block_num}.attn.qkv.weight"] 434 | attn_qkv_bias = weights[f"blocks.{block_num}.attn.qkv.bias"] 435 | 436 | attn_proj_weight = weights[f"blocks.{block_num}.attn.proj.weight"] 437 | attn_proj_bias = weights[f"blocks.{block_num}.attn.proj.bias"] 438 | 439 | norm2_weight = weights[f"blocks.{block_num}.norm2.weight"] 440 | norm2_bias = weights[f"blocks.{block_num}.norm2.bias"] 441 | 442 | mlp_fc1_weight = weights[f"blocks.{block_num}.mlp.fc1.weight"] 443 | mlp_fc1_bias = weights[f"blocks.{block_num}.mlp.fc1.bias"] 444 | 445 | mlp_fc2_weight = weights[f"blocks.{block_num}.mlp.fc2.weight"] 446 | mlp_fc2_bias = weights[f"blocks.{block_num}.mlp.fc2.bias"] 447 | # print(f">>> block param: {np.array(attn_qkv_weight).reshape(-1, 768).transpose(1, 0).shape} {np.array(attn_qkv_weight).shape}") 448 | 449 | # 网络搭建 450 | norm1_layer = layer_norm(network=network, weight=norm1_weight, 451 | bias=norm1_bias, input_tensor=input_x_tensor, 452 | exp=2) 453 | attention_layer = attention(network=network, qkv_weight=attn_qkv_weight, 454 | qkv_bias=attn_qkv_bias, proj_weight=attn_proj_weight, 455 | proj_bias=attn_proj_bias, input_tensor=norm1_layer.get_output(0)) 456 | 457 | inputx_attn_add_layer = network.add_elementwise(input_x_tensor, attention_layer.get_output(0), trt.ElementWiseOperation.SUM) 458 | 459 | norm2_layer = layer_norm(network=network, weight=norm2_weight, 460 | bias=norm2_bias, input_tensor=inputx_attn_add_layer.get_output(0), 461 | exp=2) 462 | # 计算MLP,其由linear+gelu+linear组成 463 | mlp_layer1 = mlp(network=network, input_tensor=norm2_layer.get_output(0), weight=mlp_fc1_weight, bias=mlp_fc1_bias) 464 | gelu_layer = gelu(network=network, input_tensor=mlp_layer1.get_output(0)) 
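    # fc1 -> GELU -> fc2 is the transformer MLP; together with the attention branch
    # above, the block follows the standard pre-norm layout:
    #   x = x + Attn(LN1(x));  x = x + MLP(LN2(x))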
465 | mlp_layer2 = mlp(network=network, input_tensor=gelu_layer.get_output(0), weight=mlp_fc2_weight, bias=mlp_fc2_bias) 466 | 467 | # x + mlp 468 | x_mlp_add_layer = network.add_elementwise(inputx_attn_add_layer.get_output(0), mlp_layer2.get_output(0), trt.ElementWiseOperation.SUM) 469 | # print(f">>> mlp_layer2: {x_mlp_add_layer.get_output(0).shape}") 470 | return x_mlp_add_layer 471 | 472 | 473 | def box_head_ltrb(network, input_tensor, weight): 474 | """将输入 475 | 476 | Args: 477 | network (_type_): _description_ 478 | input_tensor (_type_): _description_ 479 | weight (_type_): _description_ 480 | """ 481 | # 定义参数 482 | bh_weight_00 = weight[f'box_head.layers.0.0.weight'] 483 | bh_bias_00 = weight[f'box_head.layers.0.0.bias'] 484 | bh_norm_weight_01 = weight[f'box_head.layers.0.1.weight'] 485 | bh_norm_bias_01 = weight[f'box_head.layers.0.1.bias'] 486 | 487 | bh_weight_10 = weight[f'box_head.layers.1.0.weight'] 488 | bh_bias_10 = weight[f'box_head.layers.1.0.bias'] 489 | bh_norm_weight_11= weight[f'box_head.layers.1.1.weight'] 490 | bh_norm_bias_11 = weight[f'box_head.layers.1.1.bias'] 491 | 492 | # 计算 Linear1操作 493 | linear_layer_1 = mlp(network=network, input_tensor=input_tensor, weight=bh_weight_00, bias=bh_bias_00) 494 | norm_layer_1 = layer_norm(network=network, weight=bh_norm_weight_01, bias=bh_norm_bias_01, 495 | input_tensor=linear_layer_1.get_output(0), exp=2) 496 | relu_layer_1 = relu(network=network, input_tensor=norm_layer_1.get_output(0)) 497 | 498 | # 计算 Linear2操作 499 | linear_layer_2 = mlp(network=network, input_tensor=relu_layer_1.get_output(0), weight=bh_weight_10, bias=bh_bias_10) 500 | norm_layer_2 = layer_norm(network=network, weight=bh_norm_weight_11, bias=bh_norm_bias_11, 501 | input_tensor=linear_layer_2.get_output(0), exp=2) 502 | softmax_layer =softmax(network=network, input_tensor=norm_layer_2.get_output(0)) 503 | 504 | indice_array = indice(feat_sz=96, stride=2.3333333333333335) 505 | mul_layer = mul(network=network, input_tensor=softmax_layer.get_output(0), weight=indice_array, weight_shape=trt.Dims([1, 1, 96])) 506 | 507 | reduce_axes = 4 << 0 508 | reduce_sum_layer = network.add_reduce(mul_layer.get_output(0), trt.ReduceOperation.AVG, reduce_axes, False) 509 | # print(f">>>box_head: {mul_layer.get_output(0).shape} {reduce_sum_layer.get_output(0).shape}") 510 | return reduce_sum_layer 511 | 512 | 513 | def div(network, input_tensor, weight_const): 514 | """_summary_ 515 | 516 | Args: 517 | network (_type_): _description_ 518 | input_tensor (_type_): _description_ 519 | """ 520 | if len(input_tensor.shape) == 1: 521 | input_shape = (1,) 522 | elif len(input_tensor.shape) == 2: 523 | input_shape = (1, 1) 524 | elif len(input_tensor.shape) == 3: 525 | input_shape = (1, 1, 1) 526 | elif len(input_tensor.shape) == 4: 527 | input_shape = (1, 1, 1, 1) 528 | else: 529 | print(f">>> input_shape is not valid !!!") 530 | exit(1) 531 | 532 | const_div_layer = network.add_constant(input_shape, trt.Weights(np.array([weight_const], dtype=np.float32))).get_output(0) 533 | div_layer = network.add_elementwise(input_tensor, const_div_layer, trt.ElementWiseOperation.DIV) 534 | 535 | return div_layer 536 | 537 | 538 | def box_head(network, input_tensor1, input_tensor2, input_tensor3, input_tensor4): 539 | """ltrt四个输入 540 | 541 | Args: 542 | network (_type_): _description_ 543 | input_tensor1 (_type_): _description_ 544 | input_tensor2 (_type_): _description_ 545 | input_tensor3 (_type_): _description_ 546 | input_tensor4 (_type_): _description_ 547 | """ 548 | concate_layer_1 = 
network.add_concatenation([input_tensor1, input_tensor2, input_tensor3, input_tensor4]) 549 | concate_layer_1.axis = 1 550 | 551 | # div 552 | div_layer_1 = div(network=network, input_tensor=concate_layer_1.get_output(0), weight_const=224) 553 | 554 | # split 555 | reshape_layer = reshape(network=network, input_tensor=div_layer_1.get_output(0), new_shape=trt.Dims([1, 1, 4])) 556 | sp_layer_1, sp_layer_2, sp_layer_3, sp_layer_4 = split4(network=network, input_tensor=reshape_layer.get_output(0), split_list=[1, 1, 1, 1], axis=2) 557 | 558 | # add split1 and split3, sub split1 and split3 559 | add_sp13_layer = network.add_elementwise(sp_layer_1.get_output(0), sp_layer_3.get_output(0), trt.ElementWiseOperation.SUM) 560 | sub_sp13_layer = network.add_elementwise(sp_layer_1.get_output(0), sp_layer_3.get_output(0), trt.ElementWiseOperation.SUB) 561 | # add split2 and split4, sub split2 and split4 562 | add_sp24_layer = network.add_elementwise(sp_layer_2.get_output(0), sp_layer_4.get_output(0), trt.ElementWiseOperation.SUM) 563 | sub_sp24_layer = network.add_elementwise(sp_layer_2.get_output(0), sp_layer_4.get_output(0), trt.ElementWiseOperation.SUB) 564 | # div add_sp13 / 2 565 | div_sp13 = div(network=network, input_tensor=add_sp13_layer.get_output(0), weight_const=2.0) 566 | # div add_sp24 / 2 567 | div_sp24 = div(network=network, input_tensor=add_sp24_layer.get_output(0), weight_const=2.0) 568 | 569 | concate_layer_2 = network.add_concatenation([div_sp13.get_output(0), sub_sp13_layer.get_output(0), div_sp24.get_output(0), sub_sp24_layer.get_output(0)]) 570 | concate_layer_2.axis = 2 571 | 572 | reshape_layer_out = reshape(network=network, input_tensor=concate_layer_2.get_output(0), new_shape=trt.Dims([1, 4])) 573 | # print(f">>>box_head: {reshape_layer_out.get_output(0).shape}") 574 | 575 | return reshape_layer_out 576 | 577 | 578 | def score_head(network, input_tensor, weight): 579 | """通过给定的weight字典取得对应的权重,用于计算score_head结果 580 | 581 | Args: 582 | network (_type_): _description_ 583 | input_tensor (_type_): _description_ 584 | weight (_type_): _description_ 585 | """ 586 | sh_weight_00 = weight['score_head.layers.0.0.weight'] 587 | sh_bias_00 = weight['score_head.layers.0.0.bias'] 588 | sh_weight_1 = weight['score_head.layers.1.weight'] 589 | sh_bias_1 = weight['score_head.layers.1.bias'] 590 | 591 | # matmul add两步计算使用mmlp计算代替 592 | mm_add_weight_bias_00_layer = mlp(network=network, input_tensor=input_tensor, weight=sh_weight_00, bias = sh_bias_00) 593 | relu_layer = relu(network=network, input_tensor=mm_add_weight_bias_00_layer.get_output(0)) 594 | mm_add_weight_bias_11_layer = mlp(network=network, input_tensor=relu_layer.get_output(0), weight=sh_weight_1, bias = sh_bias_1) 595 | 596 | reduce_axes = 2 << 0 597 | reduce_sum_layer = network.add_reduce(mm_add_weight_bias_11_layer.get_output(0), trt.ReduceOperation.AVG, reduce_axes, False) 598 | 599 | # 计算sigmoid值 600 | sig_layer = sigmoid(network=network, input_tensor=reduce_sum_layer.get_output(0)) 601 | score_pred_layer = reshape(network=network, input_tensor=sig_layer.get_output(0), new_shape=trt.Dims([1])) 602 | print(f">>>score_head: {score_pred_layer.get_output(0).shape}") 603 | return score_pred_layer 604 | 605 | 606 | def construct_network(): 607 | # 读取权重 608 | wts_path = Path("model/mixformerv2.wts") 609 | weights = read_wts(wts_path) 610 | 611 | EXPLICIT_BATCH = 1 << (int)( 612 | trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 613 | 614 | TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger() 615 | with 
trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config( 616 | ) as config, builder.create_network(EXPLICIT_BATCH) as network: 617 | # define input tensor 618 | input_img_t = network.add_input( 619 | name=IN_NAME1, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H1, IN_W1)) 620 | input_img_ot = network.add_input( 621 | name=IN_NAME2, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H2, IN_W2)) 622 | input_img_search = network.add_input( 623 | name=IN_NAME3, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H3, IN_W3)) 624 | # print(f">>>input tensor: {type(input_img_ot)}") 625 | # define backbone 626 | # first layerDims2 627 | conv1_w = trt.Weights(np.array(weights['patch_embed.proj.weight']).astype(np.float32).reshape(768, 3, 16, 16)) 628 | conv1_b = trt.Weights(np.array(weights['patch_embed.proj.bias']).astype(np.float32)) 629 | 630 | conv1 = network.add_convolution_nd(input=input_img_search, num_output_maps=768, kernel_shape=trt.Dims2(16, 16), kernel=conv1_w, bias=conv1_b) 631 | conv1.stride_nd = trt.Dims2(16, 16) 632 | conv1.padding_nd = trt.Dims2(0, 0) 633 | conv1.dilation_nd = trt.Dims2(1, 1) 634 | conv1.num_groups = 1 635 | 636 | conv2 = network.add_convolution_nd(input=input_img_t, num_output_maps=768, kernel_shape=(16, 16), kernel=conv1_w, bias=conv1_b) 637 | conv2.stride_nd = trt.Dims2(16, 16) 638 | conv2.padding_nd = trt.Dims2(0, 0) 639 | conv2.dilation_nd = trt.Dims2(1, 1) 640 | conv2.num_groups = 1 641 | 642 | conv3 = network.add_convolution_nd(input=input_img_ot, num_output_maps=768, kernel_shape=(16, 16), kernel=conv1_w, bias=conv1_b) 643 | conv3.stride_nd = trt.Dims2(16, 16) 644 | conv3.padding_nd = trt.Dims2(0, 0) 645 | conv3.dilation_nd = trt.Dims2(1, 1) 646 | conv3.num_groups = 1 647 | 648 | # reshape1 649 | conv1_shape = trt.Dims3(1, 768, 196) 650 | conv2_shape = trt.Dims3(1, 768, 49) 651 | conv3_shape = trt.Dims3(1, 768, 49) 652 | reshape_1 = reshape(network=network, input_tensor=conv1.get_output(0), new_shape=conv1_shape) 653 | reshape_2 = reshape(network=network, input_tensor=conv2.get_output(0), new_shape=conv2_shape) 654 | reshape_3 = reshape(network=network, input_tensor=conv3.get_output(0), new_shape=conv3_shape) 655 | 656 | # transpose1 657 | perm = [0, 2, 1] 658 | transpose_1 = transpose(network=network, input_tensor=reshape_1.get_output(0), perm=perm) 659 | transpose_2 = transpose(network=network, input_tensor=reshape_2.get_output(0), perm=perm) 660 | transpose_3 = transpose(network=network, input_tensor=reshape_3.get_output(0), perm=perm) 661 | 662 | # add1 add2 add3 663 | pos_embed_s_input_shape = (1, 196, 768) 664 | pos_embed_t_input_shape = (1, 49, 768) 665 | 666 | pos_embed_s = trt.Weights(np.array(weights['pos_embed_s']).reshape(1, 196, 768).astype(np.float32)) 667 | pos_embed_t = trt.Weights(np.array(weights['pos_embed_t']).reshape(1, 49, 768).astype(np.float32)) 668 | 669 | pos_embed_s_constant = network.add_constant(pos_embed_s_input_shape, pos_embed_s) 670 | pos_embed_t_constant = network.add_constant(pos_embed_t_input_shape, pos_embed_t) 671 | 672 | add_layer_1 = network.add_elementwise(transpose_1.get_output(0), pos_embed_s_constant.get_output(0), trt.ElementWiseOperation.SUM) 673 | add_layer_2 = network.add_elementwise(transpose_2.get_output(0), pos_embed_t_constant.get_output(0), trt.ElementWiseOperation.SUM) 674 | add_layer_3 = network.add_elementwise(transpose_3.get_output(0), pos_embed_t_constant.get_output(0), trt.ElementWiseOperation.SUM) 675 | 676 | reg_tokens_input_shape = (1, 4, 768) 677 | reg_tokens = 
trt.Weights(np.array(weights['reg_tokens']).reshape(1, 4, 768).astype(np.float32)) 678 | reg_tokens_constant = network.add_constant(reg_tokens_input_shape, reg_tokens) 679 | 680 | pos_embed_reg_input_shape = (1, 4, 768) 681 | pos_embed_reg = trt.Weights(np.array(weights['reg_tokens']).reshape(1, 4, 768).astype(np.float32)) 682 | pos_embed_reg_constant = network.add_constant(pos_embed_reg_input_shape, pos_embed_reg) 683 | 684 | reg_tokens_add_pos_embed_reg = network.add_elementwise(reg_tokens_constant.get_output(0), 685 | pos_embed_reg_constant.get_output(0), trt.ElementWiseOperation.SUM) 686 | 687 | concat_layer_1 = network.add_concatenation([add_layer_2.get_output(0), add_layer_3.get_output(0), 688 | add_layer_1.get_output(0), reg_tokens_add_pos_embed_reg.get_output(0)]) 689 | concat_layer_1.axis = 1 690 | 691 | # blocks 692 | block0_layer = block_n(network=network, weights=weights, block_num=0, input_x_tensor=concat_layer_1.get_output(0)) 693 | block1_layer = block_n(network=network, weights=weights, block_num=1, input_x_tensor=block0_layer.get_output(0)) 694 | block2_layer = block_n(network=network, weights=weights, block_num=2, input_x_tensor=block1_layer.get_output(0)) 695 | block3_layer = block_n(network=network, weights=weights, block_num=3, input_x_tensor=block2_layer.get_output(0)) 696 | 697 | # box_head 698 | split_layer_1, split_layer_2, split_layer_3, split_layer_4 = split4(network=network, input_tensor=block3_layer.get_output(0), 699 | split_list=[49, 49, 196, 4], axis=1) 700 | bh_split_layer_1, bh_split_layer_2, bh_split_layer_3, bh_split_layer_4 = split4(network=network, input_tensor=split_layer_4.get_output(0), 701 | split_list=[1, 1, 1, 1], axis=1) 702 | bh_layer_l = box_head_ltrb(network=network, input_tensor=bh_split_layer_1.get_output(0), weight=weights) 703 | bh_layer_t = box_head_ltrb(network=network, input_tensor=bh_split_layer_2.get_output(0), weight=weights) 704 | bh_layer_r = box_head_ltrb(network=network, input_tensor=bh_split_layer_3.get_output(0), weight=weights) 705 | bh_layer_b = box_head_ltrb(network=network, input_tensor=bh_split_layer_4.get_output(0), weight=weights) 706 | # pred_boxes 707 | pred_boxes_layer = box_head(network=network, 708 | input_tensor1=bh_layer_l.get_output(0), input_tensor2=bh_layer_t.get_output(0), 709 | input_tensor3=bh_layer_r.get_output(0), input_tensor4=bh_layer_b.get_output(0)) 710 | 711 | # score head 712 | pred_scores_layer = score_head(network=network, input_tensor=split_layer_4.get_output(0), weight=weights) 713 | 714 | # 设置并标记输出 715 | pred_boxes_layer.get_output(0).name = OUT_NAME1 716 | pred_scores_layer.get_output(0).name = OUT_NAME2 717 | network.mark_output(pred_boxes_layer.get_output(0)) 718 | network.mark_output(pred_scores_layer.get_output(0)) 719 | 720 | # config = builder.create_builder_config() 721 | # config.max_workspace_size = 1 << 20 722 | # engine = builder.build_engine(network, config) 723 | #step3:创建config并设置最大batchsize和最大工作空间 724 | with builder.create_builder_config() as config: 725 | config.max_workspace_size = 4 << 20 726 | 727 | #step4:创建engine 728 | engine = builder.build_engine(network, config) 729 | 730 | #step5:序列化保存engine到planfile 731 | with open('model/mixformer_v2.engine', 'wb') as f: 732 | f.write(engine.serialize()) 733 | 734 | 735 | if __name__=="__main__": 736 | construct_network() -------------------------------------------------------------------------------- /mixformer-pytrt/mixformer_nvinfer.py: -------------------------------------------------------------------------------- 1 | import 
tensorrt as trt 2 | from collections import OrderedDict, namedtuple 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import sys 7 | import os 8 | import logging 9 | import logging.config 10 | import time 11 | import pkg_resources as pkg 12 | 13 | 14 | TRT_LOGGER = trt.Logger() 15 | trt.init_libnvinfer_plugins(TRT_LOGGER, '') 16 | 17 | # # 检查注册的操作 18 | # def get_plugin_names(): 19 | # return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list] 20 | # print('检查注册算子>> ', get_plugin_names()) 21 | 22 | LOGGING_NAME="MixformerNvinfer" 23 | LOGGER = logging.getLogger(LOGGING_NAME) 24 | 25 | ENGINE_TYPE=['mixformer_v2', 'mixformer_v2_int32', 'mixformer_v2_sim'] 26 | 27 | 28 | class MixformerNvinfer: 29 | """Mixformer Nvinfer 30 | """ 31 | def __init__(self, engine_name="mixformer_v2_sim") -> None: 32 | 33 | # 检查输入的engine_type 34 | assert engine_name in ENGINE_TYPE, "please check the engine_type whether is in ENGINE_TYPE=['bacbone_neck_x', \ 35 | 'backbone_neck_z', 'featfusor_head', 'mixformer_v2', 'mixformer_v2_int32', 'mixformer_v2_sim']." 36 | 37 | self.device = torch.device('cuda:0') 38 | 39 | # 根据输入engine类型得到对应类型的模型 40 | self.engine_path = os.path.join("model", engine_name + '.engine') 41 | if not os.path.exists(self.engine_path): 42 | LOGGER.info(f"Error ENGINE_NAME: {engine_name}") 43 | sys.exit(1) 44 | 45 | # LOGGER.info(f"loading {self.engine_path} for TensorRT inference.") 46 | print(f"loading {self.engine_path} for TensorRT inference.") 47 | self.check_version(trt.__version__, '7.0.0', hard=True) 48 | 49 | # 定义绑定数据 50 | self.Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) 51 | 52 | # 定义logger 53 | self.logger = trt.Logger(trt.Logger.INFO) 54 | 55 | # 反序列化engine文件 56 | with open(self.engine_path, 'rb') as f, trt.Runtime(self.logger) as runtime: 57 | self.model = runtime.deserialize_cuda_engine(f.read()) 58 | # runtime = trt.Runtime(self.logger) 59 | # with open(self.engine_path, "rb") as f: 60 | # serialized_engine = f.read() 61 | # self.model = runtime.deserialize_cuda_engine(serialized_engine) 62 | 63 | # 创建上下文 64 | self.context = self.model.create_execution_context() 65 | self.bindings = OrderedDict() 66 | self.output_names = [] 67 | # self.fp16 = False 68 | # self.dynamic = False 69 | # binding_names = [self.model.get_binding_name(i) for i in range(self.model.num_bindings)] 70 | # print(f">>>model numbindings: {self.model.num_bindings} {binding_names}") 71 | for i in range(self.model.num_bindings): 72 | name = self.model.get_binding_name(i) 73 | dtype = trt.nptype(self.model.get_binding_dtype(i)) 74 | # LOGGER.info(f"name: {name, dtype}") 75 | print(f"name: {name, dtype}") 76 | # input 77 | if self.model.binding_is_input(i): 78 | if -1 in tuple(self.model.get_binding_shape(i)): 79 | dynamic = True 80 | self.context.set_binding_shape(i, tuple(self.model.get_profile_shape(0,1)[2])) 81 | if dtype == np.float16: 82 | fp16 = True 83 | else: # output 84 | # print(f">>>output name: {name} {self.model.get_binding_shape(i)} {dtype}") 85 | self.output_names.append(name) 86 | 87 | shape = tuple(self.context.get_binding_shape(i)) 88 | im = torch.from_numpy(np.empty(shape, dtype)).to(self.device) 89 | 90 | # 绑定输入输出数据 91 | self.bindings[name] = self.Binding(name, dtype, shape, im, int(im.data_ptr())) 92 | 93 | # LOGGER.info(f"input and output's name and addr: {OrderedDict((n, d.ptr) for n, d in self.bindings.items())}") 94 | # 记录input_0,output_0, output_1的名称和地址 95 | self.binding_addrs = OrderedDict((n, d.ptr) for n, d in 
self.bindings.items()) 96 | self.batch_size = self.bindings['img_search'].shape[0] 97 | 98 | def infer(self, im, im_0, im_1): 99 | """Run TensorRT inference on the input images 100 | 101 | Args: 102 | im (torch.Tensor): template tensor bound to 'img_t', shape (1, 3, 112, 112) 103 | im_0 (torch.Tensor): online template tensor bound to 'img_ot', shape (1, 3, 112, 112) 104 | im_1 (torch.Tensor): search region tensor bound to 'img_search', shape (1, 3, 224, 224) 105 | 106 | Returns: 107 | list[torch.Tensor]: [pred_boxes, pred_scores] 108 | """ 109 | # print(f">>> binding_addrs keys: {self.binding_addrs.values()}") # == 110 | # Take the device addresses of the actual input tensors and assign them to binding_addrs 111 | self.binding_addrs['img_t'] = int(im.data_ptr()) 112 | self.binding_addrs['img_ot'] = int(im_0.data_ptr()) 113 | self.binding_addrs['img_search'] = int(im_1.data_ptr()) 114 | # print(f">>> binding_addrs keys1: {self.binding_addrs.values()}") # == 115 | # Pass the bound addresses to the context and run execute_v2; results are written into the bound output buffers, so reading those buffers yields the outputs 116 | self.context.execute_v2(list(self.binding_addrs.values())) 117 | y = [self.bindings[x].data for x in sorted(self.output_names)] 118 | 119 | if isinstance(y, (list, tuple)): 120 | return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y] 121 | else: 122 | return self.from_numpy(y) 123 | 124 | def from_numpy(self, x): 125 | return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x 126 | 127 | 128 | def check_version(self, current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False, verbose=False): 129 | # Check version vs. required version 130 | current, minimum = (pkg.parse_version(x) for x in (current, minimum)) 131 | result = (current == minimum) if pinned else (current >= minimum) # bool 132 | s = f'WARNING ⚠️ {name}{minimum} is required, but {name}{current} is currently installed' # string 133 | if hard: 134 | assert result, s # assert min requirements met 135 | if verbose and not result: 136 | LOGGER.warning(s) 137 | return result 138 | 139 | # class MixformerNvinfer: 140 | # def __init__(self, engine_path=None) -> None: 141 | # assert engine_path is not None, "engine path is None" 142 | 143 | # # 1. Create the TensorRT runtime 144 | # TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 145 | # runtime = trt.Runtime(TRT_LOGGER) 146 | 147 | # # 2. Load the engine from file 148 | # with open(engine_path, "rb") as f: 149 | # engine_data = f.read() 150 | # self.engine = runtime.deserialize_cuda_engine(engine_data) 151 | 152 | # # 3. 
创建执行上下文并进行推理 153 | # self.context = self.engine.create_execution_context() 154 | 155 | # def infer(self, img_t, img_ot, img_search): 156 | 157 | 158 | 159 | if __name__=="__main__": 160 | det = MixformerNvinfer(ENGINE_TYPE[2]) 161 | 162 | input= torch.rand((1, 3, 112, 112)).cuda() 163 | input0= torch.rand((1, 3, 112, 112)).cuda() 164 | input1= torch.rand((1, 3, 224, 224)).cuda() 165 | 166 | warmup_N = 100 167 | N = 1000 168 | for i in range(warmup_N): 169 | output = det.infer(input, input0, input1) 170 | 171 | start = time.time() 172 | for i in range(N): 173 | start_i = time.time() 174 | output = det.infer(input, input0, input1) 175 | # print(f">>>single infer time: {1 / (time.time() - start_i)} FPS") 176 | 177 | print(f">>>infer time: {1 / ((time.time() - start) / N)} FPS") 178 | 179 | print(f"output's length is {output[0].shape} {output[1].shape}") -------------------------------------------------------------------------------- /mixformer-pytrt/onnx2trt.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import torch 3 | import torch.utils.data 4 | import torch.onnx 5 | import onnx 6 | import tensorrt as trt 7 | import inspect 8 | import logging 9 | import logging.config as log_config 10 | import os 11 | import platform 12 | import time 13 | from pathlib import Path 14 | 15 | 16 | LOGGING_NAME = "mftrack" 17 | 18 | 19 | def set_logging(name=LOGGING_NAME, verbose=True): 20 | # sets up logging for the given name 21 | rank = int(os.getenv('RANK', -1)) # rank in world for Multi-GPU trainings 22 | level = logging.INFO if verbose and rank in {-1, 0} else logging.ERROR 23 | log_config.dictConfig({ 24 | "version": 1, 25 | "disable_existing_loggers": False, 26 | "formatters": { 27 | name: { 28 | "format": "%(message)s"}}, 29 | "handlers": { 30 | name: { 31 | "class": "logging.StreamHandler", 32 | "formatter": name, 33 | "level": level,}}, 34 | "loggers": { 35 | name: { 36 | "level": level, 37 | "handlers": [name], 38 | "propagate": False,}}}) 39 | 40 | 41 | set_logging(LOGGING_NAME) # run before defining LOGGER 42 | LOGGER = logging.getLogger(LOGGING_NAME) # define globally (used in train.py, val.py, detect.py, etc.) 43 | if platform.system() == 'Windows': 44 | for fn in LOGGER.info, LOGGER.warning: 45 | setattr(LOGGER, fn.__name__, lambda x: fn(emojis(x))) # emoji safe logging 46 | 47 | def emojis(str=''): 48 | # Return platform-dependent emoji-safe version of string 49 | return str.encode().decode('ascii', 'ignore') if platform.system() == 'Windows' else str 50 | 51 | def get_data(bs, sz): 52 | image = torch.randn(bs, 3, sz, sz).cuda() 53 | return image 54 | 55 | 56 | def to_numpy(tensor): 57 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 58 | 59 | 60 | def export_engine(f_onnx, half=False, workspace=4, verbose=False, prefix="TensorRT"): 61 | """使用onnx_parser解析onnx模型, 编译得到engine进行推理 62 | 63 | Args: 64 | f_onnx (str): ONNX文件的路径 65 | half (bool, optional): 是否使用FP16模式. 默认为False. 66 | workspace (int, optional): 最大工作空间大小(GB). 默认为4. 67 | verbose (bool, optional): 是否启用详细日志. 默认为False. 68 | prefix (str, optional): 日志前缀. 默认为"TensorRT". 
69 | """ 70 | f = "model/mixformer_v2_sim.engine" 71 | 72 | assert Path(f_onnx).exists(), f'NOt found ONNX file: {f_onnx}' 73 | model = onnx.load(f_onnx) 74 | onnx.checker.check_model(model) 75 | 76 | logger = trt.Logger(trt.Logger.INFO) 77 | if verbose: 78 | logger.min_serverity = trt.Logger.Serverity.VERBOSE 79 | 80 | builder = trt.Builder(logger) 81 | config = builder.create_builder_config() 82 | config.max_workspace_size = workspace * 1 << 30 83 | 84 | print(f'{prefix} building FP{16 if builder.platform_has_fast_fp16 and half else 32} engine as {f}') 85 | if builder.platform_has_fast_fp16 and half: 86 | print(f"转换为FP16模型.") 87 | config.set_flag(trt.BuilderFlag.FP16) 88 | 89 | flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 90 | 91 | network = builder.create_network(flag) 92 | parser = trt.OnnxParser(network, logger) 93 | if not parser.parse_from_file(str(f_onnx)): 94 | raise RuntimeError(f'failed to load ONNX file: {f_onnx}') 95 | 96 | inputs = [network.get_input(i) for i in range(network.num_inputs)] 97 | outputs = [network.get_output(i) for i in range(network.num_outputs)] 98 | for inp in inputs: 99 | print(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}') 100 | for out in outputs: 101 | print(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}') 102 | 103 | profile = builder.create_optimization_profile() 104 | config.add_optimization_profile(profile) 105 | 106 | engine = builder.build_serialized_network(network, config) 107 | with open(f, 'wb') as t: 108 | t.write(engine) 109 | 110 | def main(): 111 | export_engine(f_onnx=Path('model/mixformer_v2_sim.onnx'), half=False, verbose=False) 112 | # export_engine(f_onnx=Path('model/mixformer_v2_sim_fp16.onnx'), half=True, verbose=False) 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /mixformer_onnx.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Create by Daniel Lee on 2023/9/22 3 | // 4 | #include 5 | #include 6 | #include 7 | #include "mixformer_onnx.h" 8 | 9 | #define TIME 10 | #ifdef TIME 11 | #include 12 | #endif 13 | 14 | #ifdef TIME 15 | struct timeval tv; 16 | uint64_t time_last; 17 | double time_ms; 18 | #endif 19 | 20 | // put z and x into transform 21 | std::vector Mixformer::transform(const cv::Mat &mat_z, const cv::Mat &mat_oz, const cv::Mat &mat_x) 22 | { 23 | cv::Mat z = mat_z.clone(); 24 | cv::Mat oz = mat_oz.clone(); 25 | cv::Mat x = mat_x.clone(); 26 | 27 | cv::cvtColor(z, z, cv::COLOR_BGR2RGB); 28 | cv::cvtColor(oz, oz, cv::COLOR_BGR2RGB); 29 | cv::cvtColor(x, x, cv::COLOR_BGR2RGB); 30 | // z.convertTo(z, CV_32FC3, 1. / 255.f, 0.f); 31 | // x.convertTo(x, CV_32FC3, 1. 
/ 255.f, 0.f); 32 | 33 | this->normalize_inplace(z, means, norms); // float32 34 | this->normalize_inplace(oz, means, norms); // float32 35 | this->normalize_inplace(x, means, norms); // float32 36 | 37 | std::vector input_tensors; 38 | 39 | input_tensors.emplace_back(this->create_tensor( 40 | z, input_node_dims.at(0), memory_info_handler, 41 | input_value_handler_z, CHW)); 42 | 43 | input_tensors.emplace_back(this->create_tensor( 44 | oz, input_node_dims.at(1), memory_info_handler, 45 | input_value_handler_oz, CHW)); 46 | 47 | input_tensors.emplace_back(this->create_tensor( 48 | x, input_node_dims.at(2), memory_info_handler, 49 | input_value_handler_x, CHW)); 50 | 51 | return input_tensors; 52 | } 53 | 54 | void Mixformer::init(const cv::Mat &img, DrOBB bbox) 55 | { 56 | // get subwindow 57 | cv::Mat z_patch; 58 | float resize_factor = 1.f; 59 | this->sample_target(img, z_patch, bbox.box, this->cfg.template_factor, this->cfg.template_size, resize_factor); 60 | this->z_patch = z_patch; 61 | this->oz_patch = z_patch; 62 | this->state = bbox.box; 63 | } 64 | 65 | const DrOBB &Mixformer::track(const cv::Mat &img) 66 | { 67 | // if (img.empty()) return; 68 | // get subwindow 69 | cv::Mat x_patch; 70 | this->frame_id += 1; 71 | float resize_factor = 1.f; 72 | this->sample_target(img, x_patch, this->state, this->cfg.search_factor, this->cfg.search_size, resize_factor); 73 | 74 | // preprocess input tensor 75 | std::vector input_tensor_xz = this->transform(this->z_patch, this->oz_patch, x_patch); 76 | // std::cout << "开始跟踪1: " << input_tensor_xz.size()<< std::endl; 77 | // inference score, size and offsets 78 | std::vector output_tensors = ort_session->Run( 79 | Ort::RunOptions{nullptr}, input_node_names.data(), 80 | input_tensor_xz.data(), 3, output_node_names.data(), 2 81 | ); 82 | // std::cout << "开始跟踪2: " << std::endl; 83 | DrBBox pred_box; 84 | float pred_score; 85 | this->cal_bbox(output_tensors, pred_box, pred_score, resize_factor); 86 | 87 | this->map_box_back(pred_box, resize_factor); 88 | this->clip_box(pred_box, img.rows, img.cols, 10); 89 | 90 | object_box.box = pred_box; 91 | object_box.class_id = 0; 92 | object_box.score = pred_score; 93 | 94 | this->state = object_box.box; 95 | 96 | this->max_pred_score = this->max_pred_score * this->max_score_decay; 97 | // update template 98 | if (pred_score > 0.5 && pred_score > this->max_pred_score) 99 | { 100 | this->sample_target(img, this->max_oz_patch, this->state, this->cfg.template_factor, this->cfg.template_size, resize_factor); 101 | this->max_pred_score = pred_score; 102 | 103 | } 104 | 105 | if (this->frame_id % this->cfg.update_interval == 0) 106 | { 107 | this->oz_patch = this->max_oz_patch; 108 | this->max_pred_score = -1; 109 | this->max_oz_patch = this->oz_patch; 110 | } 111 | 112 | return object_box; 113 | } 114 | 115 | // calculate bbox 116 | void Mixformer::cal_bbox(std::vector &output_tensors, DrBBox &pred_box, float &max_score, float resize_factor) { 117 | Ort::Value &boxes_tensor = output_tensors.at(0); // (1,1,4) 118 | Ort::Value &scores_tensor = output_tensors.at(1); // (1) 119 | 120 | auto scores_ptr = scores_tensor.GetTensorData(); 121 | auto boxes_ptr = boxes_tensor.GetTensorData(); 122 | // auto dims = boxes_tensor.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape(); 123 | // std::cout << "boxes_shape: " << boxes_ptr[0] << std::endl; 124 | auto cx = boxes_ptr[0]; 125 | auto cy = boxes_ptr[1]; 126 | auto w = boxes_ptr[2]; 127 | auto h = boxes_ptr[3]; 128 | std::cout << "cx cy w h "<< cx << " " << cy << " " << w << " " 
<< h << std::endl; 129 | cx = cx * this->cfg.search_size / resize_factor; 130 | cy = cy * this->cfg.search_size / resize_factor; 131 | w = w * this->cfg.search_size / resize_factor; 132 | h = h * this->cfg.search_size / resize_factor; 133 | 134 | pred_box.x0 = cx - 0.5 * w; 135 | pred_box.y0 = cy - 0.5 * h; 136 | pred_box.x1 = pred_box.x0 + w; 137 | pred_box.y1 = pred_box.y0 + h; 138 | 139 | max_score = scores_ptr[0]; 140 | } 141 | 142 | void Mixformer::map_box_back(DrBBox &pred_box, float resize_factor) { 143 | float cx_prev = this->state.x0 + 0.5 * (this->state.x1 - this->state.x0); 144 | float cy_prev = this->state.y0 + 0.5 * (this->state.y1 - this->state.y0); 145 | 146 | float half_side = 0.5 * this->cfg.search_size / resize_factor; 147 | 148 | float w = pred_box.x1 - pred_box.x0; 149 | float h = pred_box.y1 - pred_box.y0; 150 | float cx = pred_box.x0 + 0.5 * w; 151 | float cy = pred_box.y0 + 0.5 * h; 152 | 153 | float cx_real = cx + (cx_prev - half_side); 154 | float cy_real = cy + (cy_prev - half_side); 155 | 156 | pred_box.x0 = cx_real - 0.5 * w; 157 | pred_box.y0 = cy_real - 0.5 * h; 158 | pred_box.x1 = cx_real + 0.5 * w; 159 | pred_box.y1 = cy_real + 0.5 * h; 160 | } 161 | 162 | void Mixformer::clip_box(DrBBox &box, int height, int wight, int margin) { 163 | box.x0 = std::min(std::max(0, int(box.x0)), wight - margin); 164 | box.y0 = std::min(std::max(0, int(box.y0)), height - margin); 165 | box.x1 = std::min(std::max(margin, int(box.x1)), wight); 166 | box.y1 = std::min(std::max(margin, int(box.y1)), height); 167 | } 168 | 169 | void Mixformer::sample_target(const cv::Mat &im, cv::Mat &croped, DrBBox target_bb, float search_area_factor, int output_sz, float &resize_factor) { 170 | /* Extracts a square crop centrered at target_bb box, of are search_area_factor^2 times target_bb area 171 | 172 | args: 173 | im: Img image 174 | target_bb - target box [x0, y0, x1, y1] 175 | search_area_factor - Ratio of crop size to target size 176 | output_sz - Size 177 | 178 | */ 179 | int x = target_bb.x0; 180 | int y = target_bb.y0; 181 | int w = target_bb.x1 - target_bb.x0; 182 | int h = target_bb.y1 - target_bb.y0; 183 | int crop_sz = std::ceil(std::sqrt(w *h) * search_area_factor); 184 | 185 | float cx = x + 0.5 * w; 186 | float cy = y + 0.5 * h; 187 | int x1 = std::round(cx - crop_sz * 0.5); 188 | int y1 = std::round(cy - crop_sz * 0.5); 189 | 190 | int x2 = x1 + crop_sz; 191 | int y2 = y1 + crop_sz; 192 | 193 | int x1_pad = std::max(0, -x1); 194 | int x2_pad = std::max(x2 - im.cols +1, 0); 195 | 196 | int y1_pad = std::max(0, -y1); 197 | int y2_pad = std::max(y2- im.rows + 1, 0); 198 | 199 | // Crop target 200 | cv::Rect roi_rect(x1+x1_pad, y1+y1_pad, (x2-x2_pad)-(x1+x1_pad), (y2-y2_pad)-(y1+y1_pad)); 201 | cv::Mat roi = im(roi_rect); 202 | 203 | // Pad 204 | cv::copyMakeBorder(roi, croped, y1_pad, y2_pad, x1_pad, x2_pad, cv::BORDER_CONSTANT); 205 | 206 | // Resize 207 | cv::resize(croped, croped, cv::Size(output_sz, output_sz)); 208 | 209 | resize_factor = output_sz * 1.f / crop_sz; 210 | } 211 | 212 | Ort::Value Mixformer::create_tensor(const cv::Mat &mat, 213 | const std::vector &tensor_dims, 214 | const Ort::MemoryInfo &memory_info_handler, 215 | std::vector &tensor_value_handler, 216 | unsigned int data_format = CHW) 217 | throw(std::runtime_error) 218 | { 219 | const unsigned int rows = mat.rows; 220 | const unsigned int cols = mat.cols; 221 | const unsigned int channels = mat.channels(); 222 | 223 | cv::Mat mat_ref; 224 | if (mat.type() != CV_32FC(channels)) mat.convertTo(mat_ref, 
CV_32FC(channels)); 225 | else mat_ref = mat; // reference only. zero-time cost. support 1/2/3/... channels 226 | 227 | if (tensor_dims.size() != 4) throw std::runtime_error("dims mismatch."); 228 | if (tensor_dims.at(0) != 1) throw std::runtime_error("batch != 1"); 229 | 230 | // CXHXW 231 | if (data_format == CHW) 232 | { 233 | 234 | const unsigned int target_height = tensor_dims.at(2); 235 | const unsigned int target_width = tensor_dims.at(3); 236 | const unsigned int target_channel = tensor_dims.at(1); 237 | const unsigned int target_tensor_size = target_channel * target_height * target_width; 238 | if (target_channel != channels) throw std::runtime_error("channel mismatch."); 239 | 240 | tensor_value_handler.resize(target_tensor_size); 241 | 242 | cv::Mat resize_mat_ref; 243 | if (target_height != rows || target_width != cols) 244 | cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height)); 245 | else resize_mat_ref = mat_ref; // reference only. zero-time cost. 246 | 247 | std::vector mat_channels; 248 | cv::split(resize_mat_ref, mat_channels); 249 | // CXHXW 250 | for (unsigned int i = 0; i < channels; ++i) 251 | std::memcpy(tensor_value_handler.data() + i * (target_height * target_width), 252 | mat_channels.at(i).data,target_height * target_width * sizeof(float)); 253 | 254 | return Ort::Value::CreateTensor(memory_info_handler, tensor_value_handler.data(), 255 | target_tensor_size, tensor_dims.data(), 256 | tensor_dims.size()); 257 | } 258 | 259 | // HXWXC 260 | const unsigned int target_height = tensor_dims.at(1); 261 | const unsigned int target_width = tensor_dims.at(2); 262 | const unsigned int target_channel = tensor_dims.at(3); 263 | const unsigned int target_tensor_size = target_channel * target_height * target_width; 264 | if (target_channel != channels) throw std::runtime_error("channel mismatch!"); 265 | tensor_value_handler.resize(target_tensor_size); 266 | 267 | cv::Mat resize_mat_ref; 268 | if (target_height != rows || target_width != cols) 269 | cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height)); 270 | else resize_mat_ref = mat_ref; // reference only. zero-time cost. 
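  // In the HWC case the OpenCV Mat is already stored as interleaved H x W x C in
  // memory, so a single memcpy fills the tensor buffer; the CHW branch above has
  // to split the channels and copy each plane separately.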
271 | 272 | std::memcpy(tensor_value_handler.data(), resize_mat_ref.data, target_tensor_size * sizeof(float)); 273 | 274 | return Ort::Value::CreateTensor(memory_info_handler, tensor_value_handler.data(), 275 | target_tensor_size, tensor_dims.data(), 276 | tensor_dims.size()); 277 | } 278 | 279 | cv::Mat Mixformer::normalize(const cv::Mat &mat, float mean, float scale) 280 | { 281 | cv::Mat matf; 282 | if (mat.type() != CV_32FC3) mat.convertTo(matf, CV_32FC3); 283 | else matf = mat; // reference 284 | return (matf - mean) * scale; 285 | } 286 | 287 | cv::Mat Mixformer::normalize(const cv::Mat &mat, const float mean[3], const float scale[3]) 288 | { 289 | cv::Mat mat_copy; 290 | if (mat.type() != CV_32FC3) mat.convertTo(mat_copy, CV_32FC3); 291 | else mat_copy = mat.clone(); 292 | for (unsigned int i = 0; i < mat_copy.rows; ++i) 293 | { 294 | cv::Vec3f *p = mat_copy.ptr(i); 295 | for (unsigned int j = 0; j < mat_copy.cols; ++j) 296 | { 297 | p[j][0] = (p[j][0] - mean[0]) * scale[0]; 298 | p[j][1] = (p[j][1] - mean[1]) * scale[1]; 299 | p[j][2] = (p[j][2] - mean[2]) * scale[2]; 300 | } 301 | } 302 | return mat_copy; 303 | } 304 | 305 | void Mixformer::normalize(const cv::Mat &inmat, cv::Mat &outmat, float mean, float scale) 306 | { 307 | outmat = this->normalize(inmat, mean, scale); 308 | } 309 | 310 | void Mixformer::normalize_inplace(cv::Mat &mat_inplace, float mean, float scale) 311 | { 312 | if (mat_inplace.type() != CV_32FC3) mat_inplace.convertTo(mat_inplace, CV_32FC3); 313 | this->normalize(mat_inplace, mat_inplace, mean, scale); 314 | } 315 | 316 | void Mixformer::normalize_inplace(cv::Mat &mat_inplace, const float mean[3], const float scale[3]) 317 | { 318 | if (mat_inplace.type() != CV_32FC3) mat_inplace.convertTo(mat_inplace, CV_32FC3); 319 | for (unsigned int i = 0; i < mat_inplace.rows; ++i) 320 | { 321 | cv::Vec3f *p = mat_inplace.ptr(i); 322 | for (unsigned int j = 0; j < mat_inplace.cols; ++j) 323 | { 324 | p[j][0] = (p[j][0] - mean[0]) * scale[0]; 325 | p[j][1] = (p[j][1] - mean[1]) * scale[1]; 326 | p[j][2] = (p[j][2] - mean[2]) * scale[2]; 327 | } 328 | } 329 | } 330 | 331 | Mixformer::Mixformer(const std::string &_onnx_path, unsigned int _num_threads): 332 | log_id(_onnx_path.data()), num_threads(_num_threads) 333 | { 334 | #ifdef LITE_WIN32 335 | std::wstring _w_onnx_path(lite::utils::to_wstring(_onnx_path)); 336 | onnx_path = _w_onnx_path.data(); 337 | #else 338 | onnx_path = _onnx_path.data(); 339 | #endif 340 | ort_env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, log_id); 341 | // 0. session options 342 | Ort::SessionOptions session_options; 343 | session_options.SetIntraOpNumThreads(num_threads); 344 | session_options.SetGraphOptimizationLevel( 345 | GraphOptimizationLevel::ORT_ENABLE_EXTENDED); 346 | session_options.SetLogSeverityLevel(4); 347 | // 1. session 348 | // GPU Compatibility. 349 | #ifdef USE_CUDA 350 | OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0); // C API stable. 
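  // Registers the CUDA execution provider for GPU 0; nodes the provider cannot
  // place fall back to ONNX Runtime's default CPU provider.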
351 | #endif 352 | ort_session = new Ort::Session(ort_env, onnx_path, session_options); 353 | } 354 | 355 | Mixformer::~Mixformer() 356 | { 357 | if (ort_session) 358 | delete ort_session; 359 | ort_session = nullptr; 360 | } 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | -------------------------------------------------------------------------------- /mixformer_onnx.h: -------------------------------------------------------------------------------- 1 | // 2 | // Create by Daniel Lee on 2023/9/22 3 | // 4 | 5 | #ifndef MIXFORMER_H 6 | #define MIXFORMER_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include "onnxruntime/core/session/onnxruntime_cxx_api.h" 12 | #include "onnxruntime/core/providers/cuda/cuda_provider_factory.h" 13 | 14 | #include "opencv2/opencv.hpp" 15 | #include "opencv2/highgui/highgui.hpp" 16 | #include "opencv2/imgproc/imgproc.hpp" 17 | 18 | # define USE_CUDA 19 | 20 | struct DrBBox { 21 | float x0; 22 | float y0; 23 | float x1; 24 | float y1; 25 | }; 26 | 27 | struct DrOBB { 28 | DrBBox box; 29 | float score; 30 | int class_id; 31 | }; 32 | 33 | struct Config { 34 | // std::vector window; 35 | float template_factor = 2.0; 36 | float search_factor = 4.5; // 5.0 37 | float template_size = 112; //192 38 | float search_size = 224; // 384 39 | float stride = 16; 40 | int feat_sz = 14; // 24 41 | int update_interval = 200; 42 | }; 43 | enum 44 | { 45 | CHW = 0, HWC = 1 46 | }; 47 | 48 | 49 | class Mixformer { 50 | 51 | public: 52 | Ort::Env ort_env; 53 | Ort::Session *ort_session = nullptr; 54 | // CPU MemoryInfo 55 | Ort::AllocatorWithDefaultOptions allocator; 56 | Ort::MemoryInfo memory_info_handler = Ort::MemoryInfo::CreateCpu( 57 | OrtArenaAllocator, OrtMemTypeDefault); 58 | // hardcode input node names 59 | unsigned int num_inputs = 3; 60 | std::vector input_node_names = { 61 | "img_t", 62 | "img_ot", 63 | "img_search" 64 | }; 65 | // init dynamic input dims 66 | std::vector> input_node_dims = { 67 | {1, 3, 112, 112}, // z (b=1,c,h,w) 68 | {1, 3, 112, 112}, // z (b=1,c,h,w) 69 | {1, 3, 224, 224} // x 70 | }; 71 | std::vector input_value_handler_z; 72 | std::vector input_value_handler_oz; 73 | std::vector input_value_handler_x; 74 | 75 | // hardcode output node names 76 | unsigned int num_outputs = 3; 77 | std::vector output_node_names = { 78 | "pred_boxes", 79 | "pred_scores" 80 | }; 81 | 82 | const char *onnx_path = nullptr; 83 | 84 | const char *log_id = nullptr; 85 | 86 | public: 87 | explicit Mixformer(const std::string &_onnx_path, unsigned int _num_threads = 8); 88 | 89 | ~Mixformer(); //override 90 | 91 | void init(const cv::Mat &img, DrOBB bbox); 92 | 93 | const DrOBB &track(const cv::Mat &img); 94 | 95 | // state dynamic 96 | DrBBox state; 97 | 98 | // config static 99 | Config cfg; 100 | 101 | protected: 102 | const unsigned int num_threads; // initialize at runtime. 
103 | 104 | private: 105 | 106 | std::vector transform(const cv::Mat &mat_z, const cv::Mat &mat_oz, const cv::Mat &mat_x); 107 | 108 | void map_box_back(DrBBox &pred_box, float resize_factor); 109 | 110 | void clip_box(DrBBox &box, int height, int wight, int margin); 111 | 112 | void cal_bbox(std::vector &output_tensors, DrBBox &pred_box, float &max_score, float resize_factor); 113 | 114 | void sample_target(const cv::Mat &im, cv::Mat &croped, DrBBox target_bb, float search_area_factor, int output_sz, float &resize_factor); 115 | 116 | public: 117 | 118 | Ort::Value create_tensor(const cv::Mat &mat, const std::vector &tensor_dims, 119 | const Ort::MemoryInfo &memory_info_handler, 120 | std::vector &tensor_value_handler, 121 | unsigned int data_format) throw(std::runtime_error); 122 | 123 | cv::Mat normalize(const cv::Mat &mat, float mean, float scale); 124 | 125 | cv::Mat normalize(const cv::Mat &mat, const float mean[3], const float scale[3]); 126 | 127 | void normalize(const cv::Mat &inmat, cv::Mat &outmat, float mean, float scale); 128 | 129 | void normalize_inplace(cv::Mat &mat_inplace, float mean, float scale); 130 | 131 | void normalize_inplace(cv::Mat &mat_inplace, const float mean[3], const float scale[3]); 132 | 133 | private: 134 | const float means[3] = {0.406*255, 0.485*255, 0.456*255}; // BGR 135 | const float norms[3] = {1/(0.225*255), 1/(0.229*255), 1/(0.224*255)}; // BGR 136 | float max_pred_score = -1.f; 137 | float max_score_decay = 1.f; 138 | 139 | Ort::Value *x = nullptr; 140 | Ort::Value *z = nullptr; 141 | Ort::Value *oz = nullptr; 142 | 143 | cv::Mat z_patch; // template 144 | cv::Mat oz_patch; // online_template 145 | cv::Mat max_oz_patch; // online max template 146 | 147 | DrOBB object_box; 148 | int frame_id = 0; 149 | }; 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /mixformer_trt.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Create by Daniel Lee on 2023/9/22 3 | // 4 | #include 5 | #include 6 | #include 7 | #include "mixformer_trt.h" 8 | 9 | #define TIME 10 | #ifdef TIME 11 | #include 12 | #endif 13 | 14 | #ifdef TIME 15 | struct timeval tv; 16 | uint64_t time_last; 17 | double time_ms; 18 | #endif 19 | 20 | MixformerTRT::MixformerTRT(std::string &engine_name) 21 | { 22 | // deserialize engine 23 | this->deserialize_engine(engine_name); 24 | 25 | auto out_dims_0 = this->engine->getBindingDimensions(3); 26 | for(int j=0; j < out_dims_0.nbDims; j++) 27 | { 28 | this->output_pred_boxes_size *= out_dims_0.d[j]; 29 | } 30 | 31 | auto out_dims_1 = this->engine->getBindingDimensions(4); 32 | for(int j=0; j < out_dims_1.nbDims; j++) 33 | { 34 | this->output_pred_scores_size *= out_dims_1.d[j]; 35 | } 36 | 37 | this->output_pred_boxes = new float[this->output_pred_boxes_size]; 38 | this->output_pred_scores = new float[this->output_pred_scores_size]; 39 | } 40 | 41 | MixformerTRT::~MixformerTRT(){ 42 | delete context; 43 | delete engine; 44 | delete runtime; 45 | delete[] trt_model_stream; 46 | delete[] this->output_pred_boxes; 47 | delete[] this->output_pred_scores; 48 | cudaStreamDestroy(stream); 49 | } 50 | 51 | void MixformerTRT::deserialize_engine(std::string &engine_name){ 52 | // create a model using the API directly and serialize it to a stream 53 | // char *trt_model_stream{nullptr}; 54 | std::ifstream file(engine_name, std::ios::binary); 55 | if (file.good()) 56 | { 57 | file.seekg(0, file.end); 58 | size = file.tellg(); 59 | file.seekg(0, file.beg); 
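        // Read the serialized engine into a host buffer; createInferRuntime and
        // deserializeCudaEngine below rebuild the ICudaEngine from these bytes.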
60 | this->trt_model_stream = new char[this->size]; 61 | assert(this->trt_model_stream); 62 | file.read(trt_model_stream, this->size); 63 | file.close(); 64 | } 65 | 66 | this->runtime = createInferRuntime(this->gLogger); 67 | assert(this->runtime != nullptr); 68 | 69 | this->engine = this->runtime->deserializeCudaEngine(trt_model_stream, 70 | this->size); 71 | assert(this->engine != nullptr); 72 | 73 | this->context = this->engine->createExecutionContext(); 74 | assert(context != nullptr); 75 | // delete[] trt_model_stream; 76 | } 77 | 78 | void MixformerTRT::infer( 79 | float *input_imt, 80 | float *input_imot, 81 | float *input_imsearch, 82 | float *output_pred_boxes, 83 | float *output_pred_scores, 84 | cv::Size input_imt_shape, 85 | cv::Size input_imot_shape, 86 | cv::Size input_imsearch_shape) 87 | { 88 | assert(engine->getNbBindings() == 5); 89 | void* buffers[5]; 90 | 91 | const int inputImgtIndex = engine->getBindingIndex(INPUT_BLOB_IMGT_NAME); 92 | // std::cout << ">>>debug infer start. " << (engine->getBindingDataType(inputImgtIndex) == nvinfer1::DataType::kFLOAT) << std::endl; 93 | assert(engine->getBindingDataType(inputImgtIndex) == nvinfer1::DataType::kFLOAT); 94 | const int inputImgotIndex = engine->getBindingIndex(INPUT_BLOB_IMGOT_NAME); 95 | assert(engine->getBindingDataType(inputImgotIndex) == nvinfer1::DataType::kFLOAT); 96 | const int inputImgsearchIndex = engine->getBindingIndex(INPUT_BLOB_IMGSEARCH_NAME); 97 | assert(engine->getBindingDataType(inputImgsearchIndex) == nvinfer1::DataType::kFLOAT); 98 | 99 | const int outputPredboxesIndex = engine->getBindingIndex(OUTPUT_BLOB_PREDBOXES_NAME); 100 | assert(engine->getBindingDataType(outputPredboxesIndex) == nvinfer1::DataType::kFLOAT); 101 | const int outputPredscoresIndex = engine->getBindingIndex(OUTPUT_BLOB_PREDSCORES_NAME); 102 | assert(engine->getBindingDataType(outputPredscoresIndex) == nvinfer1::DataType::kFLOAT); 103 | 104 | int mBatchSize = engine->getMaxBatchSize(); 105 | 106 | // create gpu buffer on devices 107 | CHECK(cudaMalloc(&buffers[inputImgtIndex], 3 * input_imt_shape.height * input_imt_shape.width * sizeof(float))); 108 | CHECK(cudaMalloc(&buffers[inputImgotIndex], 3 * input_imot_shape.height * input_imot_shape.width * sizeof(float))); 109 | CHECK(cudaMalloc(&buffers[inputImgsearchIndex], 3 * input_imsearch_shape.height * input_imsearch_shape.width * sizeof(float))); 110 | CHECK(cudaMalloc(&buffers[outputPredboxesIndex], this->output_pred_boxes_size * sizeof(float))); 111 | CHECK(cudaMalloc(&buffers[outputPredscoresIndex], this->output_pred_scores_size * sizeof(float))); 112 | 113 | // create stream 114 | CHECK(cudaStreamCreate(&stream)); 115 | 116 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 117 | CHECK(cudaMemcpyAsync(buffers[inputImgtIndex], input_imt, 3 * input_imt_shape.height * input_imt_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream)); 118 | CHECK(cudaMemcpyAsync(buffers[inputImgotIndex], input_imot, 3 * input_imot_shape.height * input_imot_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream)); 119 | CHECK(cudaMemcpyAsync(buffers[inputImgsearchIndex], input_imsearch, 3 * input_imsearch_shape.height * input_imsearch_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream)); 120 | 121 | // inference 122 | context->enqueue(mBatchSize, buffers, stream, nullptr); 123 | 124 | CHECK(cudaMemcpyAsync(output_pred_boxes, buffers[outputPredboxesIndex], this->output_pred_boxes_size * sizeof(float), cudaMemcpyDeviceToHost, 
stream)); 125 | CHECK(cudaMemcpyAsync(output_pred_scores, buffers[outputPredscoresIndex], this->output_pred_scores_size * sizeof(float), cudaMemcpyDeviceToHost, stream)); 126 | cudaStreamSynchronize(stream); 127 | 128 | // release buffers 129 | CHECK(cudaFree(buffers[inputImgtIndex])); 130 | CHECK(cudaFree(buffers[inputImgotIndex])); 131 | CHECK(cudaFree(buffers[inputImgsearchIndex])); 132 | CHECK(cudaFree(buffers[outputPredboxesIndex])); 133 | CHECK(cudaFree(buffers[outputPredscoresIndex])); 134 | // std::cout << ">>>debug infer end. " << std::endl; 135 | } 136 | 137 | // put z and x into transform 138 | void MixformerTRT::transform(cv::Mat &mat_z, cv::Mat &mat_oz, cv::Mat &mat_x) 139 | { 140 | this->blob_from_image_half(mat_z, mat_oz, mat_x); 141 | } 142 | 143 | void MixformerTRT::blob_from_image_half(cv::Mat& img, cv::Mat &imgot, cv::Mat &imgx) { 144 | cv::Mat imt_t; 145 | cv::Mat imot_t; 146 | cv::Mat imx_t; 147 | // cv::imshow("BGR", img); 148 | // cv::waitKey(500); 149 | cvtColor(img, imt_t, cv::COLOR_BGR2RGB); 150 | // cv::imshow("RGB", imt_t); 151 | // cv::waitKey(500); 152 | cvtColor(imgot, imot_t, cv::COLOR_BGR2RGB); 153 | cvtColor(imgx, imx_t, cv::COLOR_BGR2RGB); 154 | 155 | // these blobs must be freed promptly (deleted in track() after inference) 156 | this->input_imt = new float[img.total() * 3]; // FP32 blob for the template image 157 | this->input_imot = new float[imgot.total() * 3]; // FP32 blob for the online template 158 | this->input_imsearch = new float[imgx.total() * 3]; // FP32 blob for the search region 159 | 160 | half_norm(imt_t, this->input_imt); 161 | half_norm(imot_t, this->input_imot); 162 | half_norm(imx_t, this->input_imsearch); 163 | } 164 | 165 | void MixformerTRT::half_norm(const cv::Mat &img, float* input_data) 166 | { 167 | int channels = 3; 168 | int img_h = img.rows; 169 | int img_w = img.cols; 170 | 171 | cv::Mat img_cp; 172 | img_cp = img.clone(); 173 | 174 | for (size_t c = 0; c < channels; c++) { 175 | for (size_t h = 0; h < img_h; h++) { 176 | for (size_t w = 0; w < img_w; w++) { 177 | input_data[c * img_w * img_h + h * img_w + w] = 178 | cv::saturate_cast<float>((((float)img_cp.at<cv::Vec3b>(h, w)[c]) - mean_vals[c]) * norm_vals[c]); 179 | } 180 | } 181 | } 182 | 183 | } 184 | 185 | 186 | void MixformerTRT::blob_from_image_half(cv::Mat& img, float* input_blob_half) { 187 | int channels = 3; 188 | int img_h = img.rows; 189 | int img_w = img.cols; 190 | for (size_t c = 0; c < channels; c++) { 191 | for (size_t h = 0; h < img_h; h++) { 192 | for (size_t w = 0; w < img_w; w++) { 193 | input_blob_half[c * img_w * img_h + h * img_w + w] = float(img.at<cv::Vec3b>(h, w)[c]); 194 | // cv::saturate_cast<float>((((float)img.at<cv::Vec3b>(h, w)[c]) / 255.0f - mean[c]) / std_var[c]); 195 | // cv::saturate_cast<float>((float)img.at<cv::Vec3b>(h, w)[c]); 196 | // std::cout << input_blob_half[c * img_w * img_h + h * img_w + w] << std::endl; 197 | } 198 | } 199 | } 200 | } 201 | 202 | void MixformerTRT::init(const cv::Mat &img, DrOBB bbox) 203 | { 204 | // get subwindow 205 | cv::Mat zt_patch; 206 | cv::Mat ozt_patch; 207 | float resize_factor = 1.f; 208 | this->sample_target(img, zt_patch, bbox.box, this->cfg.template_factor, this->cfg.template_size, resize_factor); 209 | this->sample_target(img, ozt_patch, bbox.box, this->cfg.template_factor, this->cfg.template_size, resize_factor); 210 | // cv::Mat oz_patch = z_patch.clone(); 211 | this->z_patch = zt_patch; 212 | this->oz_patch = ozt_patch; 213 | this->state = bbox.box; 214 | } 215 | 216 | const DrOBB &MixformerTRT::track(const cv::Mat &img) 217 | { 218 | // if (img.empty()) return; 219 | // get subwindow 220 | cv::Mat x_patch;
221 | this->frame_id += 1; 222 | float resize_factor = 1.f; 223 | this->sample_target(img, x_patch, this->state, this->cfg.search_factor, this->cfg.search_size, resize_factor); 224 | 225 | // preprocess input tensor 226 | this->transform(this->z_patch, this->oz_patch, x_patch); 227 | 228 | // inference score, size and offsets 229 | cv::Size input_imt_shape = this->z_patch.size(); 230 | cv::Size input_imot_shape = this->oz_patch.size(); 231 | cv::Size input_imsearch_shape = x_patch.size(); 232 | 233 | this->infer(input_imt, input_imot, input_imsearch, 234 | output_pred_boxes, output_pred_scores, 235 | input_imt_shape, input_imot_shape, 236 | input_imsearch_shape); 237 | 238 | delete[] this->input_imt; 239 | delete[] this->input_imot; 240 | delete[] this->input_imsearch; 241 | 242 | DrBBox pred_box; 243 | float pred_score; 244 | 245 | this->cal_bbox(output_pred_boxes, output_pred_scores, pred_box, pred_score, resize_factor); 246 | 247 | this->map_box_back(pred_box, resize_factor); 248 | this->clip_box(pred_box, img.rows, img.cols, 10); 249 | 250 | object_box.box = pred_box; 251 | object_box.class_id = 0; 252 | object_box.score = pred_score; 253 | 254 | this->state = object_box.box; 255 | 256 | this->max_pred_score = this->max_pred_score * this->max_score_decay; 257 | // update template 258 | if (pred_score > 0.9 && pred_score > this->max_pred_score) 259 | { 260 | this->sample_target(img, this->max_oz_patch, this->state, this->cfg.template_factor, this->cfg.template_size, resize_factor); 261 | this->max_pred_score = pred_score; 262 | } 263 | 264 | if (this->frame_id % this->cfg.update_interval == 0) 265 | { 266 | this->oz_patch = this->max_oz_patch; 267 | this->max_pred_score = -1.0; 268 | this->max_oz_patch = this->oz_patch; 269 | } 270 | 271 | return object_box; 272 | } 273 | 274 | // calculate bbox 275 | void MixformerTRT::cal_bbox(float *boxes_ptr, float * scores_ptr, DrBBox &pred_box, float &max_score, float resize_factor) { 276 | auto cx = boxes_ptr[0]; 277 | auto cy = boxes_ptr[1]; 278 | auto w = boxes_ptr[2]; 279 | auto h = boxes_ptr[3]; 280 | // std::cout << "cal_bbox cx cy w h "<< cx << " " << cy << " " << w << " " << h << std::endl; 281 | cx = cx * this->cfg.search_size / resize_factor; 282 | cy = cy * this->cfg.search_size / resize_factor; 283 | w = w * this->cfg.search_size / resize_factor; 284 | h = h * this->cfg.search_size / resize_factor; 285 | 286 | pred_box.x0 = cx - 0.5 * w; 287 | pred_box.y0 = cy - 0.5 * h; 288 | pred_box.x1 = pred_box.x0 + w; 289 | pred_box.y1 = pred_box.y0 + h; 290 | 291 | max_score = scores_ptr[0]; 292 | } 293 | 294 | void MixformerTRT::map_box_back(DrBBox &pred_box, float resize_factor) { 295 | float cx_prev = this->state.x0 + 0.5 * (this->state.x1 - this->state.x0); 296 | float cy_prev = this->state.y0 + 0.5 * (this->state.y1 - this->state.y0); 297 | 298 | float half_side = 0.5 * this->cfg.search_size / resize_factor; 299 | 300 | float w = pred_box.x1 - pred_box.x0; 301 | float h = pred_box.y1 - pred_box.y0; 302 | float cx = pred_box.x0 + 0.5 * w; 303 | float cy = pred_box.y0 + 0.5 * h; 304 | 305 | float cx_real = cx + (cx_prev - half_side); 306 | float cy_real = cy + (cy_prev - half_side); 307 | 308 | pred_box.x0 = cx_real - 0.5 * w; 309 | pred_box.y0 = cy_real - 0.5 * h; 310 | pred_box.x1 = cx_real + 0.5 * w; 311 | pred_box.y1 = cy_real + 0.5 * h; 312 | } 313 | 314 | void MixformerTRT::clip_box(DrBBox &box, int height, int wight, int margin) { 315 | box.x0 = std::min(std::max(0, int(box.x0)), wight - margin); 316 | box.y0 = std::min(std::max(0, 
int(box.y0)), height - margin); 317 | box.x1 = std::min(std::max(margin, int(box.x1)), wight); 318 | box.y1 = std::min(std::max(margin, int(box.y1)), height); 319 | } 320 | 321 | void MixformerTRT::sample_target(const cv::Mat &im, cv::Mat &croped, DrBBox target_bb, float search_area_factor, int output_sz, float &resize_factor) { 322 | /* Extracts a square crop centered on the target_bb box, of area search_area_factor^2 times the target_bb area 323 | 324 | args: 325 | im - input image 326 | target_bb - target box [x0, y0, x1, y1] 327 | search_area_factor - ratio of crop size to target size 328 | output_sz - size to which the square crop is resized 329 | 330 | */ 331 | int x = target_bb.x0; 332 | int y = target_bb.y0; 333 | int w = target_bb.x1 - target_bb.x0; 334 | int h = target_bb.y1 - target_bb.y0; 335 | int crop_sz = std::ceil(std::sqrt(w * h) * search_area_factor); 336 | 337 | float cx = x + 0.5 * w; 338 | float cy = y + 0.5 * h; 339 | int x1 = std::round(cx - crop_sz * 0.5); 340 | int y1 = std::round(cy - crop_sz * 0.5); 341 | 342 | int x2 = x1 + crop_sz; 343 | int y2 = y1 + crop_sz; 344 | 345 | int x1_pad = std::max(0, -x1); 346 | int x2_pad = std::max(x2 - im.cols + 1, 0); 347 | 348 | int y1_pad = std::max(0, -y1); 349 | int y2_pad = std::max(y2 - im.rows + 1, 0); 350 | 351 | // Crop target 352 | cv::Rect roi_rect(x1+x1_pad, y1+y1_pad, (x2-x2_pad)-(x1+x1_pad), (y2-y2_pad)-(y1+y1_pad)); 353 | cv::Mat roi = im(roi_rect); 354 | 355 | // Pad 356 | cv::copyMakeBorder(roi, croped, y1_pad, y2_pad, x1_pad, x2_pad, cv::BORDER_CONSTANT); 357 | 358 | // Resize 359 | cv::resize(croped, croped, cv::Size(output_sz, output_sz)); 360 | 361 | resize_factor = output_sz * 1.f / crop_sz; 362 | } 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | -------------------------------------------------------------------------------- /mixformer_trt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Daniel Lee on 2023/9/22 3 | // 4 | 5 | #ifndef MIXFORMER_TRT_H 6 | #define MIXFORMER_TRT_H 7 | 8 | #include <iostream> 9 | #include <fstream> 10 | #include <string> 11 | #include <vector> 12 | #include <cmath> 13 | #include <algorithm> 14 | #include "opencv2/opencv.hpp" 15 | #include "opencv2/highgui/highgui.hpp" 16 | #include "opencv2/imgproc/imgproc.hpp" 17 | #include <chrono> 18 | #include <memory> 19 | #include "NvInfer.h" 20 | #include "cuda_runtime_api.h" 21 | #include "cuda_fp16.h" 22 | #include "logging.h" 23 | 24 | #define CHECK(status) \ 25 | do\ 26 | {\ 27 | auto ret = (status);\ 28 | if (ret != 0)\ 29 | {\ 30 | std::cerr << "Cuda failure: " << ret << std::endl;\ 31 | abort();\ 32 | }\ 33 | } while (0) 34 | 35 | #define DEVICE 0 // GPU id 36 | 37 | #define USE_CUDA 38 | 39 | using namespace nvinfer1; 40 | 41 | struct DrBBox { 42 | float x0; 43 | float y0; 44 | float x1; 45 | float y1; 46 | }; 47 | 48 | struct DrOBB { 49 | DrBBox box; 50 | float score; 51 | int class_id; 52 | }; 53 | 54 | struct Config { 55 | // std::vector<float> window; 56 | float template_factor = 2.0; 57 | float search_factor = 4.5; // 5.0 58 | int template_size = 112; //192 59 | int search_size = 224; // 384 60 | int stride = 16; 61 | int feat_sz = 14; // 24 62 | int update_interval = 200; 63 | }; 64 | enum 65 | { 66 | CHW = 0, HWC = 1 67 | }; 68 | 69 | 70 | class MixformerTRT { 71 | 72 | private: 73 | Logger gLogger; 74 | 75 | const char* INPUT_BLOB_IMGT_NAME = "img_t"; 76 | const char* INPUT_BLOB_IMGOT_NAME = "img_ot"; 77 | const char* INPUT_BLOB_IMGSEARCH_NAME = "img_search"; 78 | 79 | const char* OUTPUT_BLOB_PREDBOXES_NAME = "pred_boxes"; 80 | const char* OUTPUT_BLOB_PREDSCORES_NAME =
"pred_scores"; 81 | 82 | char *trt_model_stream = nullptr; 83 | 84 | size_t size{0}; 85 | 86 | // define the TensorRT runtime, engine, context,stream 87 | IRuntime *runtime = nullptr; 88 | ICudaEngine *engine = nullptr; 89 | IExecutionContext *context = nullptr; 90 | 91 | cudaStream_t stream; 92 | 93 | // init dynamic input dims 94 | std::vector> input_node_dims = { 95 | {1, 3, 112, 112}, // z (b=1,c,h,w) 96 | {1, 3, 112, 112}, // z (b=1,c,h,w) 97 | {1, 3, 224, 224} // x 98 | }; 99 | 100 | // Define FP32 mean and scale values 101 | const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f}; // RGB 102 | const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f}; 103 | 104 | float max_pred_score = -1.f; 105 | float max_score_decay = 1.f; 106 | 107 | cv::Mat z_patch; // template 108 | cv::Mat oz_patch; // online_template 109 | cv::Mat max_oz_patch; // online max template 110 | 111 | DrOBB object_box; 112 | int frame_id = 0; 113 | 114 | int output_pred_boxes_size = 1; 115 | int output_pred_scores_size = 1; 116 | 117 | 118 | float *input_imt = nullptr; 119 | float *input_imot = nullptr; 120 | float *input_imsearch = nullptr; 121 | float *output_pred_boxes = nullptr; 122 | float *output_pred_scores = nullptr; 123 | 124 | // 数据尺度的定义 125 | static const int INPUT_IMT_W = 112; 126 | static const int INPUT_IMOT_W = 112; 127 | static const int INPUT_IMSEARCH_W = 224; 128 | 129 | static const int INPUT_IMT_H = 112; 130 | static const int INPUT_IMOT_H = 112; 131 | static const int INPUT_IMSEARCH_H = 224; 132 | 133 | private: 134 | 135 | void transform(cv::Mat &mat_z, cv::Mat &mat_oz, cv::Mat &mat_x); 136 | 137 | void map_box_back(DrBBox &pred_box, float resize_factor); 138 | 139 | void clip_box(DrBBox &box, int height, int wight, int margin); 140 | 141 | void cal_bbox(float *boxes_ptr, float * scores_ptr, DrBBox &pred_box, float &max_score, float resize_factor); 142 | 143 | void sample_target(const cv::Mat &im, cv::Mat &croped, DrBBox target_bb, float search_area_factor, int output_sz, float &resize_factor); 144 | 145 | public: 146 | 147 | MixformerTRT(std::string &engine_name); 148 | 149 | ~MixformerTRT(); //override 150 | 151 | void init(const cv::Mat &img, DrOBB bbox); 152 | 153 | const DrOBB &track(const cv::Mat &img); 154 | 155 | // state dynamic 156 | DrBBox state; 157 | 158 | // config static 159 | Config cfg; 160 | 161 | void deserialize_engine(std::string &engine_name); 162 | 163 | void infer( 164 | float *input_imt, 165 | float *input_imot, 166 | float *input_imsearch, 167 | float *output_pred_boxes, 168 | float *output_pred_scores, 169 | cv::Size input_imt_shape, 170 | cv::Size input_imot_shape, 171 | cv::Size input_imsearch_shape); 172 | 173 | void blob_from_image_half(cv::Mat& img, float* output_data); 174 | 175 | void blob_from_image_half(cv::Mat& img, cv::Mat &imgot, cv::Mat &imgx); 176 | 177 | void half_norm(const cv::Mat &img, float* input_data); 178 | 179 | 180 | }; 181 | 182 | #endif 183 | -------------------------------------------------------------------------------- /test/test_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import json 3 | 4 | if __name__=="__main__": 5 | id = 0 6 | while True: 7 | # 创建UDP套接字 8 | sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 9 | 10 | # 目标服务器的IP和端口 11 | server_address = ('localhost', 12345) 12 | 13 | # 要发送的JSON数据 14 | data = { 15 | 'name': 'Alice', 16 | 'age': id 17 | } 18 | id += 1 19 | message = 
json.dumps(data).encode('utf-8') 20 | 21 | # Send the data 22 | sock.sendto(message, server_address) 23 | sock.close() -------------------------------------------------------------------------------- /test/test_server.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import json 3 | 4 | # Create a UDP socket 5 | sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 6 | server_address = ('localhost', 12345) 7 | sock.bind(server_address) 8 | 9 | while True: 10 | # Wait for incoming data 11 | data, address = sock.recvfrom(4096) 12 | 13 | # Parse the JSON payload 14 | received_data = json.loads(data.decode('utf-8')) 15 | print("Received data:", received_data) 16 | 17 | # If you do not want to wait indefinitely, add an exit condition 18 | if received_data.get("exit"): 19 | break 20 | 21 | sock.close() -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import torch 4 | 5 | def cxy_wh_2_rect(pos, sz): 6 | return [float(max(float(0), pos[0] - sz[0] / 2)), float(max(float(0), pos[1] - sz[1] / 2)), float(sz[0]), 7 | float(sz[1])] # 0-index 8 | 9 | 10 | def hann1d(sz: int, centered = True) -> np.ndarray: 11 | """1D cosine window.""" 12 | if centered: 13 | return 0.5 * (1 - np.cos((2 * math.pi / (sz + 1)) * np.arange(1, sz + 1).astype(np.float64))) 14 | w = 0.5 * (1 + np.cos((2 * math.pi / (sz + 2)) * np.arange(0, sz//2 + 1).astype(np.float64))) 15 | return np.concatenate([w, np.flip(w[1:sz-sz//2], (0,))]) 16 | 17 | 18 | def hann2d(sz: np.ndarray, centered = True) -> np.ndarray: 19 | """2D cosine window.""" 20 | return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(sz[1].item(), centered).reshape(1, 1, 1, -1) 21 | 22 | # def hann1d(sz: int, centered = True) -> np.ndarray: 23 | # """1D cosine window.""" 24 | # if centered: 25 | # return 0.5 * (1 - torch.cos((2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float())) 26 | # w = 0.5 * (1 + torch.cos((2 * math.pi / (sz + 2)) * torch.arange(0, sz//2 + 1).float())) 27 | # return torch.cat([w, w[1:sz-sz//2].flip((0,))]).numpy() 28 | 29 | 30 | # def hann2d(sz: np.ndarray, centered = True) -> np.ndarray: 31 | # """2D cosine window.""" 32 | # sz = torch.from_numpy(sz) 33 | # ret = hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(sz[1].item(), centered).reshape(1, 1, 1, -1) 34 | # return ret.numpy() 35 | 36 | def img2tensor(img): 37 | img = img[..., ::-1] # BGR2RGB 38 | img = img - (0.485*255, 0.456*255, 0.406*255) 39 | img = img * (1/(0.229*255), 1/(0.224*255), 1/(0.225*255)) 40 | img = np.transpose(img, (2, 0, 1)) # (H, W, 3) -> (3, H, W) 41 | img = np.expand_dims(img, axis=0) # (3, H, W) -> (1, 3, H, W) 42 | 43 | return img --------------------------------------------------------------------------------
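The entry point main_trt.cpp is not reproduced in this listing, so below is a minimal, hypothetical driver sketch for the MixformerTRT class defined in mixformer_trt.h. It only illustrates the intended call sequence (deserialize the engine once, call init() with the first frame and a starting box, then call track() per frame); the hard-coded initial box, the display loop, and the exit key are assumptions for illustration and the real main_trt.cpp may differ.

```cpp
// Hypothetical usage sketch of MixformerTRT (not the repository's main_trt.cpp).
#include <iostream>
#include <string>
#include <opencv2/opencv.hpp>
#include "mixformer_trt.h"

int main(int argc, char** argv)
{
    if (argc != 3) {
        std::cerr << "usage: ./mixformer-trt <engine_path> <video_path>" << std::endl;
        return -1;
    }

    // Deserialize the TensorRT engine once; the constructor allocates the output buffers.
    std::string engine_path = argv[1];
    MixformerTRT tracker(engine_path);

    cv::VideoCapture cap(argv[2]);
    cv::Mat frame;
    cap >> frame;
    if (frame.empty()) return -1;

    // Assumed initial target box (x0, y0, x1, y1); in practice it would come from
    // cv::selectROI or a detector.
    DrOBB init_box;
    init_box.box = {100.f, 100.f, 200.f, 200.f};
    init_box.score = 1.f;
    init_box.class_id = 0;
    tracker.init(frame, init_box);

    // Track frame by frame and draw the predicted box.
    while (cap.read(frame)) {
        const DrOBB &obb = tracker.track(frame);
        cv::rectangle(frame,
                      cv::Point(int(obb.box.x0), int(obb.box.y0)),
                      cv::Point(int(obb.box.x1), int(obb.box.y1)),
                      cv::Scalar(0, 255, 0), 2);
        cv::imshow("mixformer-trt", frame);
        if (cv::waitKey(1) == 27) break; // ESC to quit
    }
    return 0;
}
```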