├── .gitignore ├── simple-sgm ├── CMakeLists.txt ├── apps │ ├── resources │ │ ├── imLeft.png │ │ └── imRight.png │ ├── CMakeLists.txt │ ├── simple-sgm.cpp │ └── utils.h └── sgm │ ├── CMakeLists.txt │ └── include │ └── sgm │ ├── sgm_utils.h │ └── sgm.h ├── README.md ├── conanfile.py ├── CMakeLists.txt ├── LICENSE └── .clang-format /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /simple-sgm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(sgm) 2 | 3 | if(BUILD_APPS) 4 | add_subdirectory(apps) 5 | endif() 6 | 7 | -------------------------------------------------------------------------------- /simple-sgm/apps/resources/imLeft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gipeto/simple-sgm/HEAD/simple-sgm/apps/resources/imLeft.png -------------------------------------------------------------------------------- /simple-sgm/apps/resources/imRight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gipeto/simple-sgm/HEAD/simple-sgm/apps/resources/imRight.png -------------------------------------------------------------------------------- /simple-sgm/apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(simple-sgm simple-sgm.cpp utils.h) 3 | target_link_libraries(simple-sgm sgm::sgm stb::stb) 4 | 5 | install(TARGETS simple-sgm 6 | DESTINATION ${CMAKE_INSTALL_BINDIR}) 7 | -------------------------------------------------------------------------------- /simple-sgm/sgm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_library(sgm INTERFACE) 3 | 4 | target_include_directories(sgm INTERFACE $ 
"${CMAKE_CURRENT_BINARY_DIR}") 5 | 6 | install(TARGETS sgm 7 | DESTINATION ${CMAKE_INSTALL_BINDIR}) 8 | 9 | add_library(sgm::sgm ALIAS sgm) 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # simple-sgm 2 | Implementation of a simplified version of the SemiGlobal Matching algorithm (Hirschmuller, H. (2005). Accurate and Efficient Stereo Processing by Semi Global Matching and Mutual Information. CVPR .) 3 | 4 | ## How to build 5 | 6 | ``` 7 | #install conan, needed only once 8 | pip install conan 9 | 10 | #from the repo root 11 | conan install -if build --build missing . 12 | cd build 13 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -G "Ninja" .. 14 | ninja install 15 | 16 | ``` -------------------------------------------------------------------------------- /conanfile.py: -------------------------------------------------------------------------------- 1 | 2 | from conans import ConanFile, CMake, tools 3 | 4 | 5 | class SimpleSGM(ConanFile): 6 | settings = "os", "compiler", "build_type", "arch" 7 | generators = "cmake_find_package" 8 | options = {"build_apps": [True, False]} 9 | default_options = {"build_apps": True} 10 | 11 | def requirements(self): 12 | if self.options.build_apps: 13 | self.requires('stb/20190512@conan/stable') 14 | 15 | def imports(self): 16 | self.copy("*.dll", dst="bin", src="bin") 17 | self.copy("*.dylib*", dst="lib", src="lib") 18 | self.copy('*.so*', dst='lib', src='lib') 19 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project(simple-sgm VERSION 0.1.0 LANGUAGES C CXX) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_EXTENSIONS OFF) 7 | 
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 9 | 10 | option(BUILD_APPS "Build test applications" ON) 11 | 12 | set(CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}) 13 | 14 | if(APPLE) 15 | set(basePoint @loader_path) 16 | else() 17 | set(basePoint $ORIGIN) 18 | endif() 19 | 20 | include(GNUInstallDirs) 21 | file(RELATIVE_PATH relDir 22 | ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR} 23 | ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) 24 | 25 | set(CMAKE_INSTALL_RPATH ${basePoint} ${basePoint}/${relDir}) 26 | 27 | # Find required packages 28 | if(BUILD_APPS) 29 | find_package(stb REQUIRED) 30 | endif() 31 | 32 | include(GenerateExportHeader) 33 | add_subdirectory(simple-sgm) 34 | 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Javier Perez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /simple-sgm/sgm/include/sgm/sgm_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #if defined(_MSC_VER) 8 | #include 9 | #define __avx2_dispatch 10 | #else 11 | #include 12 | #define __avx2_dispatch __attribute__((__target__("avx2"))) 13 | #endif 14 | 15 | namespace sgm 16 | { 17 | template 18 | struct aligned_deleter 19 | { 20 | void operator()(T* p) const 21 | { 22 | _mm_free(p); 23 | p = nullptr; 24 | } 25 | }; 26 | 27 | template 28 | using unique_ptr_aligned = typename std::unique_ptr>; 29 | 30 | template 31 | auto make_unique_aligned(size_t n) -> std::enable_if_t::value, unique_ptr_aligned> 32 | { 33 | auto p = _mm_malloc(n * sizeof(T), Alignment); 34 | 35 | if (nullptr == p) 36 | { 37 | throw std::runtime_error("Failed to allocate aligned memory"); 38 | } 39 | std::memset(p, 0, n * sizeof(T)); 40 | 41 | return unique_ptr_aligned(static_cast(p)); 42 | } 43 | 44 | template 45 | inline T Min(T const& left, T const& right) 46 | { 47 | return left < right ? 
left : right; 48 | } 49 | 50 | struct SimpleImage 51 | { 52 | unique_ptr_aligned Buffer; 53 | size_t Width; 54 | size_t Height; 55 | 56 | bool operator==(const SimpleImage& Other) const noexcept 57 | { 58 | return (Width == Other.Width) && (Height == Other.Height); 59 | } 60 | 61 | bool operator!=(const SimpleImage& Other) const noexcept 62 | { 63 | return !this->operator==(Other); 64 | } 65 | 66 | explicit operator bool() const noexcept 67 | { 68 | return nullptr != Buffer; 69 | } 70 | }; 71 | 72 | } // namespace sgm -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: true 5 | AlignEscapedNewlinesLeft: false 6 | AlignOperands: true 7 | AlignTrailingComments: true 8 | AllowAllParametersOfDeclarationOnNextLine: true 9 | AllowShortBlocksOnASingleLine: false 10 | AllowShortCaseLabelsOnASingleLine: false 11 | AllowShortFunctionsOnASingleLine: None 12 | AllowShortIfStatementsOnASingleLine: false 13 | AllowShortLoopsOnASingleLine: false 14 | AlwaysBreakAfterDefinitionReturnType: false 15 | AlwaysBreakBeforeMultilineStrings: false 16 | AlwaysBreakTemplateDeclarations: true 17 | BinPackArguments: true 18 | BinPackParameters: true 19 | BreakBeforeBinaryOperators: NonAssignment 20 | BreakBeforeBraces: Allman 21 | BreakBeforeTernaryOperators: true 22 | BreakConstructorInitializersBeforeComma: true 23 | ColumnLimit: 120 24 | CommentPragmas: '' 25 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 26 | ConstructorInitializerIndentWidth: 6 27 | ContinuationIndentWidth: 4 28 | Cpp11BracedListStyle: true 29 | DerivePointerAlignment: false 30 | DisableFormat: false 31 | ExperimentalAutoDetectBinPacking: false 32 | ForEachMacros: [] 33 | IndentCaseLabels: false 34 | IndentWidth: 4 35 | IndentWrappedFunctionNames: false 36 | KeepEmptyLinesAtTheStartOfBlocks: false 37 | 
MaxEmptyLinesToKeep: 1 38 | NamespaceIndentation: None 39 | FixNamespaceComments: true 40 | ObjCBlockIndentWidth: 2 41 | ObjCSpaceAfterProperty: false 42 | ObjCSpaceBeforeProtocolList: false 43 | PenaltyBreakBeforeFirstCallParameter: 1 44 | PenaltyBreakComment: 300 45 | PenaltyBreakFirstLessLess: 120 46 | PenaltyBreakString: 1000 47 | PenaltyExcessCharacter: 1000000 48 | PenaltyReturnTypeOnItsOwnLine: 200 49 | PointerAlignment: Left 50 | SpaceAfterCStyleCast: false 51 | SpaceBeforeAssignmentOperators: true 52 | SpaceBeforeParens: ControlStatements 53 | SpaceInEmptyParentheses: false 54 | SpacesBeforeTrailingComments: 2 55 | SpacesInAngles: false 56 | SpacesInContainerLiterals: true 57 | SpacesInCStyleCastParentheses: false 58 | SpacesInParentheses: false 59 | SpacesInSquareBrackets: false 60 | Standard: Cpp11 61 | TabWidth: 4 62 | UseTab: Never 63 | ... 64 | -------------------------------------------------------------------------------- /simple-sgm/apps/simple-sgm.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include 3 | 4 | namespace 5 | { 6 | static constexpr int S_OK = 0; 7 | static constexpr int S_FAIL = -1; 8 | 9 | auto static constexpr DMin = 0; 10 | auto static constexpr DMax = 64; 11 | 12 | } // namespace 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | if (argc != 4) 17 | { 18 | std::cout << std::endl; 19 | std::cout << "Computes a disparity map for the input left and right images" << std::endl; 20 | std::cout << "The output image is saved as png" << std::endl; 21 | std::cout << "Usage: simple-sgm " << std::endl; 22 | std::cout << std::endl; 23 | return S_OK; 24 | } 25 | 26 | std::cout << "Left image: " << argv[1] << std::endl; 27 | std::cout << "Right image: " << argv[2] << std::endl; 28 | std::cout << "Output image path: " << argv[3] << std::endl; 29 | 30 | try 31 | { 32 | auto LeftImage = utils::io::readImage(argv[1]); 33 | auto RightImage = utils::io::readImage(argv[2]); 34 | 35 
| if (LeftImage != RightImage) 36 | { 37 | throw std::runtime_error("Images must have the same dimension"); 38 | } 39 | 40 | sgm::SimpleImage DMap; 41 | if (utils::instructionset::avx2_supported()) 42 | { 43 | utils::perf::PerformanceTimer timer("AVX2 accelerated sgm"); 44 | sgm::SemiGlobalMatching Sgm(std::move(LeftImage), std::move(RightImage)); 45 | Sgm.SetPenalities(10, 80); 46 | DMap = Sgm.GetDisparity(); 47 | } 48 | else 49 | { 50 | utils::perf::PerformanceTimer timer("Non vectorized sgm"); 51 | sgm::SemiGlobalMatching Sgm(std::move(LeftImage), std::move(RightImage)); 52 | Sgm.SetPenalities(10, 80); 53 | DMap = Sgm.GetDisparity(); 54 | } 55 | 56 | utils::io::saveImage(argv[3], DMap); 57 | 58 | return S_OK; 59 | } 60 | catch (const std::exception& e) 61 | { 62 | std::cerr << "Error: " << e.what() << std::endl; 63 | return S_FAIL; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /simple-sgm/apps/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define STB_IMAGE_IMPLEMENTATION 6 | #include 7 | #define STB_IMAGE_WRITE_IMPLEMENTATION 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace utils 17 | { 18 | namespace instructionset 19 | { 20 | void run_cpuid(int eax, int ecx, int* cpuInfo) 21 | { 22 | #if defined(_MSC_VER) 23 | __cpuidex(cpuInfo, eax, ecx); 24 | 25 | #else 26 | int ebx = 0; 27 | int edx = 0; 28 | 29 | #if defined(__i386__) && defined(__PIC__) 30 | 31 | /* in case of PIC under 32-bit EBX cannot be clobbered */ 32 | __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" 33 | : "=D"(ebx), 34 | #else 35 | __asm__("cpuid" 36 | : "+b"(ebx), 37 | #endif 38 | "+a"(eax), "+c"(ecx), "=d"(edx)); 39 | cpuInfo[0] = eax; 40 | cpuInfo[1] = ebx; 41 | cpuInfo[2] = ecx; 42 | cpuInfo[3] = edx; 43 | #endif 44 | } 45 | 46 | bool avx2_supported() 47 | { 48 | const int 
AVX_2_SUPPORTED = (1 << 5) | (1 << 3) | (1 << 8); 49 | int abcd[4]; 50 | run_cpuid(7, 0, abcd); 51 | return (abcd[1] & AVX_2_SUPPORTED) == AVX_2_SUPPORTED; 52 | } 53 | } // namespace instructionset 54 | 55 | namespace io 56 | { 57 | sgm::SimpleImage readImage(std::string filename) 58 | { 59 | int width{}; 60 | int height{}; 61 | int channels{}; 62 | const int desiredChannels = 1; 63 | auto* pixelData = stbi_load(filename.c_str(), &width, &height, &channels, desiredChannels); 64 | 65 | if (0 == pixelData) 66 | { 67 | throw std::runtime_error("Failed to load " + filename); 68 | } 69 | 70 | sgm::SimpleImage image{sgm::make_unique_aligned(static_cast(width * height)), 71 | static_cast(width), static_cast(height)}; 72 | 73 | std::memcpy(&image.Buffer.get()[0], pixelData, static_cast(width * height)); 74 | stbi_image_free(pixelData); 75 | 76 | return image; 77 | } 78 | 79 | void saveImage(std::string filename, const sgm::SimpleImage& image) 80 | { 81 | if (0 82 | == stbi_write_png(filename.c_str(), static_cast(image.Width), static_cast(image.Height), 1, 83 | image.Buffer.get(), static_cast(image.Width))) 84 | { 85 | throw std::runtime_error("Failed to write " + filename); 86 | } 87 | } 88 | 89 | } // namespace io 90 | 91 | namespace perf 92 | { 93 | class PerformanceTimer 94 | { 95 | std::string m_message; 96 | std::chrono::high_resolution_clock::time_point m_start; 97 | 98 | public: 99 | PerformanceTimer(std::string message) 100 | : m_message(std::move(message)) 101 | , m_start(std::chrono::high_resolution_clock::now()) 102 | { 103 | } 104 | 105 | ~PerformanceTimer() 106 | { 107 | const auto stop = std::chrono::high_resolution_clock::now(); 108 | std::chrono::duration diff = stop - m_start; 109 | 110 | std::cout << std::endl; 111 | std::cout << "-------------------------------------------" << std::endl; 112 | std::cout << m_message << " did execute in " << 1000. 
* diff.count() << " ms" << std::endl; 113 | std::cout << "-------------------------------------------" << std::endl; 114 | } 115 | }; 116 | 117 | } // namespace perf 118 | 119 | } // namespace utils -------------------------------------------------------------------------------- /simple-sgm/sgm/include/sgm/sgm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace sgm 7 | { 8 | /* 9 | Implementation of the SemiGlobal Matching algorithm [1], with the following limitations: 10 | 11 | - The matching cost is a simple absolute difference of the pixel luminance 12 | - Only 8 paths are considered 13 | - Penalty P2 is not weighted by the image gradient 14 | - No consistency checks 15 | 16 | [1] Hirschmuller, H. (2005). Accurate and Efficient Stereo Processing by Semi Global Matching and Mutual Information. 17 | CVPR. 18 | 19 | */ 20 | template 21 | class SemiGlobalMatching 22 | { 23 | using T = unsigned short; 24 | auto static constexpr DInt = DMax - DMin; 25 | 26 | static_assert(DMin >= 0 && DMax >= 0, "DMin and DMax must be positive"); 27 | static_assert(DMax > DMin, "DMax must be larger than DMin"); 28 | static_assert(0 == DMin % 16, "DMin must be a multiple of 16"); 29 | static_assert(0 == DMax % 16, "DMax must be a multiple of 16"); 30 | 31 | template 32 | struct Loop 33 | { 34 | inline static void EvaluateMin(T* Lmin, T& GlobalMin, T* Lp, T P1) noexcept 35 | { 36 | GlobalMin = Min(GlobalMin, Lp[cnt]); 37 | 38 | if (0 == cnt) 39 | { 40 | Lmin[cnt] = Min(Lp[cnt], static_cast(Lp[cnt + 1] + P1)); 41 | Loop::EvaluateMin(Lmin, GlobalMin, Lp, P1); 42 | return; 43 | } 44 | 45 | if (N - 1 == cnt) 46 | { 47 | Lmin[cnt] = Min(static_cast(Lp[cnt - 1] + P1), Lp[cnt]); 48 | Loop::EvaluateMin(Lmin, GlobalMin, Lp, P1); 49 | return; 50 | } 51 | 52 | Lmin[cnt] = Min(static_cast(Min(Lp[cnt - 1], Lp[cnt + 1]) + P1), Lp[cnt]); 53 | Loop::EvaluateMin(Lmin, GlobalMin, Lp, P1); 54 | } 55 | 56 | 
inline static T GetMinIdx(T& GlobalMin, T* Lp, int d) noexcept 57 | { 58 | if (Lp[cnt] < GlobalMin) 59 | { 60 | d = cnt; 61 | GlobalMin = Lp[cnt]; 62 | } 63 | 64 | return Loop::GetMinIdx(GlobalMin, Lp, d); 65 | } 66 | 67 | __avx2_dispatch inline static void EvaluateMinAVX2(T* Lmin, __m256i& GlobalMin, T* Lp, __m256i& P1) noexcept 68 | { 69 | auto _Lp = _mm256_load_si256(reinterpret_cast<__m256i*>(Lp)); 70 | auto _Lp_plus = _mm256_lddqu_si256(reinterpret_cast<__m256i*>(Lp + 1)); 71 | 72 | GlobalMin = _mm256_min_epu16(GlobalMin, _Lp); 73 | 74 | if (0 == cnt) 75 | { 76 | auto _min = _mm256_min_epu16(_Lp, _mm256_adds_epu16(_Lp_plus, P1)); 77 | _mm256_store_si256(reinterpret_cast<__m256i*>(Lmin), _min); 78 | Loop::EvaluateMinAVX2(Lmin + 16, GlobalMin, Lp + 16, P1); 79 | return; 80 | } 81 | 82 | auto _Lp_minus = _mm256_lddqu_si256(reinterpret_cast<__m256i*>(Lp - 1)); 83 | 84 | if (N - 16 == cnt) 85 | { 86 | auto _min = _mm256_min_epu16(_Lp, _mm256_adds_epu16(_Lp_minus, P1)); 87 | _mm256_store_si256(reinterpret_cast<__m256i*>(Lmin), _min); 88 | Loop::EvaluateMinAVX2(Lmin + 16, GlobalMin, Lp + 16, P1); 89 | return; 90 | } 91 | 92 | auto _min = _mm256_min_epu16(_Lp, _mm256_adds_epu16(_mm256_min_epu16(_Lp_minus, _Lp_plus), P1)); 93 | _mm256_store_si256(reinterpret_cast<__m256i*>(Lmin), _min); 94 | Loop::EvaluateMinAVX2(Lmin + 16, GlobalMin, Lp + 16, P1); 95 | } 96 | }; 97 | 98 | template 99 | struct Loop 100 | { 101 | inline static void EvaluateMin(T*, T&, T*, T) noexcept 102 | { 103 | } 104 | 105 | inline static T GetMinIdx(T&, T*, int d) noexcept 106 | { 107 | return d; 108 | } 109 | 110 | __avx2_dispatch inline static void EvaluateMinAVX2(T*, __m256i&, T*, __m256i&) noexcept 111 | { 112 | } 113 | }; 114 | 115 | using BufferPtr = unique_ptr_aligned; 116 | auto static constexpr Alignment = 32; 117 | 118 | BufferPtr C; 119 | BufferPtr S; 120 | 121 | BufferPtr PathStorage[4]; 122 | 123 | BufferPtr min_Lp_r; 124 | 125 | size_t Width; 126 | size_t Height; 127 | 128 | 
SimpleImage Left; 129 | SimpleImage Right; 130 | 131 | T m_P1 = 5; 132 | T m_P2 = 30; 133 | 134 | public: 135 | SemiGlobalMatching(SimpleImage&& _Left, SimpleImage&& _Right) 136 | : Left(std::move(_Left)) 137 | , Right(std::move(_Right)) 138 | { 139 | Width = Left.Width; 140 | Height = Left.Height; 141 | 142 | C = make_unique_aligned(Width * Height * DInt); 143 | S = make_unique_aligned(Width * Height * DInt); 144 | PathStorage[0] = make_unique_aligned(DInt); 145 | PathStorage[1] = make_unique_aligned(Width * DInt); 146 | PathStorage[2] = make_unique_aligned(Width * DInt); 147 | PathStorage[3] = make_unique_aligned(Width * DInt); 148 | min_Lp_r = make_unique_aligned(DInt); 149 | ComputeCost(); 150 | } 151 | 152 | inline void SetPenalities(T P1, T P2) 153 | { 154 | m_P1 = P1; 155 | m_P2 = P2; 156 | } 157 | 158 | inline void ComputeCost() 159 | { 160 | for (auto iy = 0; iy < Height; iy++) 161 | { 162 | for (auto ix = 0; ix < DMin; ix++) 163 | { 164 | auto iidx = ix + Width * iy; 165 | 166 | for (auto d = 0; d < DInt; d++) 167 | { 168 | auto idx = d + iidx * DInt; 169 | C[idx] = (1 << 11); 170 | assert(idx >= 0 && idx < Width * Height * DInt); 171 | } 172 | } 173 | 174 | for (auto ix = DMin; ix < DMax; ix++) 175 | { 176 | auto iidx = ix + Width * iy; 177 | 178 | for (auto d = DMin; d < ix; d++) 179 | { 180 | auto idx = d - DMin + iidx * DInt; 181 | C[idx] = abs(static_cast(Left.Buffer[iidx]) - static_cast(Right.Buffer[iidx - d])); 182 | assert(idx >= 0 && idx < Width * Height * DInt); 183 | assert(iidx - d >= 0 && iidx - d < Width * Height); 184 | } 185 | 186 | for (auto d = ix; d < DMax; d++) 187 | { 188 | auto idx = d - DMin + iidx * DInt; 189 | C[idx] = (1 << 11); 190 | assert(idx >= 0 && idx < Width * Height * DInt); 191 | } 192 | } 193 | 194 | for (auto ix = DMax; ix < Width; ix++) 195 | { 196 | auto iidx = ix + Width * iy; 197 | 198 | for (auto d = 0; d < DInt; d++) 199 | { 200 | auto idx = d + iidx * DInt; 201 | C[idx] = 202 | abs(static_cast(Left.Buffer[iidx]) 
- static_cast(Right.Buffer[iidx - d - DMin])); 203 | assert(idx >= 0 && idx < Width * Height * DInt); 204 | assert(iidx - d - DMin >= 0 && iidx - d - DMin < Width * Height); 205 | } 206 | } 207 | } 208 | } 209 | // Runs the forward and backward aggregation passes, picks for every pixel the disparity index with the lowest summed cost in S, and rescales those indices to the 0..255 range of the returned image. 210 | SimpleImage GetDisparity() 211 | { 212 | ForwardPass(); 213 | BackwardPass(); 214 | 215 | auto Disparity = make_unique_aligned(Width * Height); 216 | auto Output = make_unique_aligned(Width * Height); 217 | 218 | T MaxDisparity = DMin; 219 | T MinDisparity = DMax; 220 | 221 | for (auto i = 0; i < Height * Width; i++) 222 | { 223 | auto MinVal = std::numeric_limits::max(); 224 | auto d = Loop<0, DInt>::GetMinIdx(MinVal, S.get() + i * DInt, 0); 225 | 226 | if (d > MaxDisparity) 227 | MaxDisparity = d; 228 | if (d < MinDisparity) 229 | MinDisparity = d; 230 | 231 | Disparity[i] = d; 232 | } 233 | const int DisparityRange = MaxDisparity - MinDisparity; // 0 when every pixel shares the same disparity 234 | for (auto i = 0; i < Height * Width; i++) 235 | { 236 | Output[i] = (DisparityRange > 0) ? (Disparity[i] - MinDisparity) * 255 / DisparityRange : 0; // flat map -> all zeros instead of dividing by zero 237 | } 238 | 239 | return {std::move(Output), Width, Height}; 240 | } 241 | 242 | private: 243 | __avx2_dispatch inline void EvaluateMinAVX2Proxy(T* Lmin, T& GlobalMin, T* Lp, T P1) noexcept 244 | { 245 | __m256i _GlobalMin = _mm256_set1_epi16(GlobalMin); 246 | __m256i _P1 = _mm256_set1_epi16(P1); 247 | 248 | Loop<0, (DInt >> 4)>::EvaluateMinAVX2(Lmin, _GlobalMin, Lp, _P1); 249 | 250 | alignas(32) T LastMin[16]; 251 | _mm256_store_si256(reinterpret_cast<__m256i*>(&LastMin), _GlobalMin); 252 | 253 | for (auto& i : LastMin) 254 | { 255 | if (i < GlobalMin) 256 | { 257 | GlobalMin = i; 258 | } 259 | } 260 | } 261 | 262 | template 263 | __avx2_dispatch inline void UpdatePath(T* pS, T* const pC, size_t widx) noexcept 264 | { 265 | auto pshift = (0 == P) ? 
0 : widx * DInt; 266 | auto path_vector = PathStorage[P].get() + pshift; 267 | 268 | if (init) 269 | { 270 | for (auto d = 0; d < DInt; d++) 271 | { 272 | auto path_cost = pC[d]; 273 | pS[d] += path_cost; 274 | path_vector[d] = path_cost; 275 | } 276 | 277 | return; 278 | } 279 | 280 | T LGmin = std::numeric_limits::max(); 281 | 282 | if (WithAVX2) 283 | { 284 | EvaluateMinAVX2Proxy(min_Lp_r.get(), LGmin, path_vector, m_P1); 285 | // Costs are unsigned 16-bit values, so P2 + LGmin uses the unsigned saturating add like the other operations below. 286 | auto _P2 = _mm256_set1_epi16(m_P2); 287 | auto _LGmin = _mm256_set1_epi16(LGmin); 288 | auto _Lp_r_far = _mm256_adds_epu16(_P2, _LGmin); 289 | 290 | auto pCtmp = pC; 291 | auto min_Lp_r_tmp = min_Lp_r.get(); 292 | auto path_vector_tmp = path_vector; 293 | auto pStmp = pS; 294 | 295 | for (auto d = 0; d < (DInt >> 4); d++) 296 | { 297 | auto _pC = _mm256_load_si256(reinterpret_cast<__m256i*>(pCtmp)); 298 | auto _min_Lp_r = _mm256_load_si256(reinterpret_cast<__m256i*>(min_Lp_r_tmp)); 299 | auto _path_vector = _mm256_load_si256(reinterpret_cast<__m256i*>(path_vector_tmp)); 300 | auto _pS = _mm256_load_si256(reinterpret_cast<__m256i*>(pStmp)); 301 | 302 | auto _path_cost = 303 | _mm256_subs_epu16(_mm256_adds_epu16(_pC, _mm256_min_epu16(_min_Lp_r, _Lp_r_far)), _LGmin); 304 | _pS = _mm256_adds_epu16(_pS, _path_cost); 305 | 306 | _mm256_store_si256(reinterpret_cast<__m256i*>(pStmp), _pS); 307 | _mm256_store_si256(reinterpret_cast<__m256i*>(path_vector_tmp), _path_cost); 308 | 309 | pCtmp += 16; 310 | min_Lp_r_tmp += 16; 311 | path_vector_tmp += 16; 312 | pStmp += 16; 313 | } 314 | 315 | return; 316 | } 317 | 318 | Loop<0, DInt>::EvaluateMin(min_Lp_r.get(), LGmin, path_vector, m_P1); 319 | 320 | for (auto d = 0; d < DInt; d++) 321 | { 322 | auto path_cost = pC[d] + Min(min_Lp_r[d], static_cast(LGmin + m_P2)) - LGmin; 323 | pS[d] += path_cost; 324 | path_vector[d] = path_cost; 325 | } 326 | } 327 | 328 | template 329 | inline void ForwardPass() noexcept 330 | { 331 | // top left corner 332 | UpdatePath<0, true, WithAVX2>(S.get(), C.get(), 
0); 333 | UpdatePath<1, true, WithAVX2>(S.get(), C.get(), 0); 334 | UpdatePath<2, true, WithAVX2>(S.get(), C.get(), 0); 335 | UpdatePath<3, true, WithAVX2>(S.get(), C.get(), 0); 336 | 337 | // first line 338 | for (auto ix = 1; ix < Width; ix++) 339 | { 340 | UpdatePath<0, false, WithAVX2>(S.get() + ix * DInt, C.get() + ix * DInt, ix); 341 | UpdatePath<1, true, WithAVX2>(S.get() + ix * DInt, C.get() + ix * DInt, ix); 342 | UpdatePath<2, true, WithAVX2>(S.get() + ix * DInt, C.get() + ix * DInt, ix); 343 | UpdatePath<3, true, WithAVX2>(S.get() + ix * DInt, C.get() + ix * DInt, ix); 344 | } 345 | 346 | for (auto iy = 1; iy < Height; iy++) 347 | { 348 | // first pixel of the line 349 | auto idy = Width * iy; 350 | 351 | UpdatePath<0, true, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 352 | UpdatePath<1, true, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 353 | UpdatePath<2, false, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 354 | UpdatePath<3, false, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 355 | 356 | for (auto ix = 1; ix < Width; ix++) 357 | { 358 | auto idx = ix + idy; 359 | UpdatePath<0, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 360 | UpdatePath<1, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 361 | UpdatePath<2, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 362 | UpdatePath<3, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 363 | } 364 | } 365 | } 366 | 367 | template 368 | inline void BackwardPass() noexcept 369 | { 370 | auto last = Width * Height - 1; 371 | 372 | // bottom right corner 373 | UpdatePath<0, true, WithAVX2>(S.get() + last * DInt, C.get() + last * DInt, 0); 374 | UpdatePath<1, true, WithAVX2>(S.get() + last * DInt, C.get() + last * DInt, 0); 375 | UpdatePath<2, true, WithAVX2>(S.get() + last * DInt, C.get() + last * DInt, 0); 376 | UpdatePath<3, true, WithAVX2>(S.get() + last * DInt, C.get() + last * DInt, 0); 
377 | 378 | // last line 379 | for (auto ix = 1; ix < Width; ix++) 380 | { 381 | UpdatePath<0, false, WithAVX2>(S.get() + (last - ix) * DInt, C.get() + (last - ix) * DInt, ix); 382 | UpdatePath<1, true, WithAVX2>(S.get() + (last - ix) * DInt, C.get() + (last - ix) * DInt, ix); 383 | UpdatePath<2, true, WithAVX2>(S.get() + (last - ix) * DInt, C.get() + (last - ix) * DInt, ix); 384 | UpdatePath<3, true, WithAVX2>(S.get() + (last - ix) * DInt, C.get() + (last - ix) * DInt, ix); 385 | } 386 | 387 | for (auto iy = 1; iy < Height; iy++) 388 | { 389 | // first pixel of the line 390 | auto idy = last - Width * iy; 391 | 392 | UpdatePath<0, true, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 393 | UpdatePath<1, true, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 394 | UpdatePath<2, false, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 395 | UpdatePath<3, false, WithAVX2>(S.get() + idy * DInt, C.get() + idy * DInt, 0); 396 | 397 | for (auto ix = 1; ix < Width; ix++) 398 | { 399 | auto idx = idy - ix; 400 | UpdatePath<0, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 401 | UpdatePath<1, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 402 | UpdatePath<2, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 403 | UpdatePath<3, false, WithAVX2>(S.get() + idx * DInt, C.get() + idx * DInt, ix); 404 | } 405 | } 406 | } 407 | }; 408 | 409 | } // namespace sgm 410 | --------------------------------------------------------------------------------