├── .gitignore ├── CMakeLists.txt ├── CPU ├── descriptor.cpp ├── descriptor.h ├── elas.cpp ├── elas.h ├── filter.cpp ├── filter.h ├── image.h ├── matrix.cpp ├── matrix.h ├── timer.h ├── triangle.cpp └── triangle.h ├── GPU ├── RUN ├── elas_gpu.cu └── elas_gpu.h ├── GPU_test ├── 2016_12_06_cpu │ ├── aloe_left_disp.pgm │ ├── aloe_right_disp.pgm │ ├── cones_left_disp.pgm │ ├── cones_right_disp.pgm │ ├── raindeer_left_disp.pgm │ ├── raindeer_right_disp.pgm │ ├── urban1_left_disp.pgm │ ├── urban1_right_disp.pgm │ ├── urban2_left_disp.pgm │ ├── urban2_right_disp.pgm │ ├── urban3_left_disp.pgm │ ├── urban3_right_disp.pgm │ ├── urban4_left_disp.pgm │ └── urban4_right_disp.pgm └── 2016_12_06_gpu │ ├── aloe_left_disp.pgm │ ├── aloe_right_disp.pgm │ ├── cones_left_disp.pgm │ ├── cones_right_disp.pgm │ ├── raindeer_left_disp.pgm │ ├── raindeer_right_disp.pgm │ ├── urban1_left_disp.pgm │ ├── urban1_right_disp.pgm │ ├── urban2_left_disp.pgm │ ├── urban2_right_disp.pgm │ ├── urban3_left_disp.pgm │ ├── urban3_right_disp.pgm │ ├── urban4_left_disp.pgm │ └── urban4_right_disp.pgm ├── LICENSE ├── ReadMe.md ├── input ├── aloe_left.pgm ├── aloe_right.pgm ├── cones_left.pgm ├── cones_right.pgm ├── raindeer_left.pgm ├── raindeer_right.pgm ├── urban1_left.pgm ├── urban1_right.pgm ├── urban2_left.pgm ├── urban2_right.pgm ├── urban3_left.pgm ├── urban3_right.pgm ├── urban4_left.pgm └── urban4_right.pgm ├── main_cpu.cpp ├── main_gpu.cu ├── main_test.cpp ├── references ├── 2010ACCV_Geiger.pdf ├── 2016IROS_Maddern.pdf └── StereoNotes.pdf └── reports ├── 2016_ELEG655_project_presentation.pdf ├── 2016_ELEG655_project_propsal.pdf ├── 2016_ELEG655_project_report.pdf └── project.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | output/* 3 | !output/.gitkeep 4 | .vscode 5 | input/*_disp.pgm -------------------------------------------------------------------------------- /CMakeLists.txt: 
--------------------------------------------------------------------------------
# Build script for libelas-gpu.
#
# Targets:
#   libelas_cpu  - CPU reference implementation (main_cpu.cpp)
#   libelas_gpu  - CUDA implementation (main_gpu.cu); only when CUDA is found
#   libelas_test - test driver (main_test.cpp)
#
# Cache options:
#   LIBELAS_CUDA_ARCH - compute capability passed to nvcc (default: 20)

# The "<min>...<max>" form keeps 2.8.8 as the real minimum (CMake older than
# 3.12 ignores the "...<max>" suffix) while telling newer CMake releases that
# the file works with policies up to 3.26. The cap stays below 3.27 because
# policy CMP0146, introduced there, removes the FindCUDA module used below.
cmake_minimum_required(VERSION 2.8.8...3.26)

# Project name
project(libelas-gpu)

# CUDA is optional: only the libelas_gpu binary needs it, so the CPU and test
# binaries can still be built on machines without the toolkit.
find_package(CUDA)

# Flags are kept in CMAKE_CXX_FLAGS (rather than per-target options) because
# FindCUDA forwards this variable to nvcc's host compiler by default
# (CUDA_PROPAGATE_HOST_FLAGS) and because target_compile_options() is not
# available in CMake 2.8.8.

# Try to compile with c++11
# http://stackoverflow.com/a/25836953
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" COMPILER_SUPPORTS_CXX11)
check_cxx_compiler_flag("-std=c++0x" COMPILER_SUPPORTS_CXX0X)
if(COMPILER_SUPPORTS_CXX11)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
else()
  # WARNING instead of STATUS so the problem is not lost in configure output.
  message(WARNING "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()

# Enable compile optimizations
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

# Enable SSE3 cpu commands, but only when the compiler accepts the flags.
# On other compilers/architectures (e.g. ARM) an alternative to the SSE code
# paths is still needed; this check only avoids passing unknown options.
check_cxx_compiler_flag("-msse3" COMPILER_SUPPORTS_SSE3)
if(COMPILER_SUPPORTS_SSE3)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3")
else()
  message(WARNING "The compiler ${CMAKE_CXX_COMPILER} does not accept -msse3; SSE flags were not added.")
endif()

# GPU architecture to generate code for, written without the dot.
# The default (20) matches the value this project was originally built with;
# CUDA 9 and newer no longer support sm_20, so override it there, e.g.
#   cmake -DLIBELAS_CUDA_ARCH=61 ..
set(LIBELAS_CUDA_ARCH "20" CACHE STRING
    "CUDA compute capability to build for, without the dot (e.g. 20, 35, 61)")

# Include our header files.
# Kept at directory scope: FindCUDA's cuda_compile() has no target to read
# include directories from and takes them from the directory instead.
include_directories(CPU GPU GPU_test)

# Set the files to build (CPU)
set(sources_cpu
  CPU/descriptor.cpp
  CPU/elas.cpp
  CPU/filter.cpp
  CPU/matrix.cpp
  CPU/triangle.cpp
)

# Set the files to build (GPU)
set(sources_gpu
  GPU/elas_gpu.cu
)

# CPU binary
add_executable(libelas_cpu main_cpu.cpp ${sources_cpu})

# GPU binary: the .cu files are compiled to object files with nvcc, then
# linked together with the CPU sources.
if(CUDA_FOUND)
  # Set our nvcc flags
  # http://stackoverflow.com/a/13244930
  list(APPEND CUDA_NVCC_FLAGS
    -arch sm_${LIBELAS_CUDA_ARCH}
    -gencode arch=compute_${LIBELAS_CUDA_ARCH},code=sm_${LIBELAS_CUDA_ARCH}
  )
  cuda_compile(sources_gpu_built main_gpu.cu ${sources_gpu})
  cuda_add_executable(libelas_gpu ${sources_gpu_built} ${sources_cpu})
else()
  message(WARNING "CUDA toolkit not found: the libelas_gpu target will not be built.")
endif()

# Testing binary
add_executable(libelas_test main_test.cpp ${sources_cpu})
-------------------------------------------------------------------------------- /CPU/descriptor.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 
7 | Authors: Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #include "descriptor.h" 23 | #include "filter.h" 24 | #include 25 | 26 | using namespace std; 27 | 28 | Descriptor::Descriptor(uint8_t* I,int32_t width,int32_t height,int32_t bpl,bool half_resolution) { 29 | I_desc = (uint8_t*)_mm_malloc(16*width*height*sizeof(uint8_t),16); 30 | uint8_t* I_du = (uint8_t*)_mm_malloc(bpl*height*sizeof(uint8_t),16); 31 | uint8_t* I_dv = (uint8_t*)_mm_malloc(bpl*height*sizeof(uint8_t),16); 32 | //Filter call so sobel filter to get lines better 33 | filter::sobel3x3(I,I_du,I_dv,bpl,height); 34 | //Create 16 byte discriptors for each deep image pixel 35 | createDescriptor(I_du,I_dv,width,height,bpl,half_resolution); 36 | _mm_free(I_du); 37 | _mm_free(I_dv); 38 | } 39 | 40 | Descriptor::~Descriptor() { 41 | _mm_free(I_desc); 42 | } 43 | 44 | void Descriptor::createDescriptor (uint8_t* I_du,uint8_t* I_dv,int32_t width,int32_t height,int32_t bpl,bool half_resolution) { 45 | 46 | uint8_t *I_desc_curr; 47 | uint32_t addr_v0,addr_v1,addr_v2,addr_v3,addr_v4; 48 | 49 | // do not compute every second line 50 | if (half_resolution) { 51 | 52 | // create filter strip 53 | for (int32_t v=4; v 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | // Define fixed-width datatypes for Visual Studio projects 
36 | #ifndef _MSC_VER 37 | #include 38 | #else 39 | typedef __int8 int8_t; 40 | typedef __int16 int16_t; 41 | typedef __int32 int32_t; 42 | typedef __int64 int64_t; 43 | typedef unsigned __int8 uint8_t; 44 | typedef unsigned __int16 uint16_t; 45 | typedef unsigned __int32 uint32_t; 46 | typedef unsigned __int64 uint64_t; 47 | #endif 48 | 49 | class Descriptor { 50 | 51 | public: 52 | 53 | // constructor creates filters 54 | Descriptor(uint8_t* I,int32_t width,int32_t height,int32_t bpl,bool half_resolution); 55 | 56 | // deconstructor releases memory 57 | ~Descriptor(); 58 | 59 | // descriptors accessible from outside 60 | uint8_t* I_desc; 61 | 62 | private: 63 | 64 | // build descriptor I_desc from I_du and I_dv 65 | void createDescriptor(uint8_t* I_du,uint8_t* I_dv,int32_t width,int32_t height,int32_t bpl,bool half_resolution); 66 | 67 | }; 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /CPU/elas.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 7 | Authors: Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | // Main header file. Include this to use libelas in your code. 23 | 24 | #ifndef __ELAS_H__ 25 | #define __ELAS_H__ 26 | 27 | // Enable profiling 28 | #define PROFILE 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | // define fixed-width datatypes for Visual Studio projects 38 | #ifndef _MSC_VER 39 | #include 40 | #else 41 | typedef __int8 int8_t; 42 | typedef __int16 int16_t; 43 | typedef __int32 int32_t; 44 | typedef __int64 int64_t; 45 | typedef unsigned __int8 uint8_t; 46 | typedef unsigned __int16 uint16_t; 47 | typedef unsigned __int32 uint32_t; 48 | typedef unsigned __int64 uint64_t; 49 | #endif 50 | 51 | #ifdef PROFILE 52 | #include "timer.h" 53 | #endif 54 | 55 | class Elas { 56 | 57 | public: 58 | 59 | enum setting {ROBOTICS,MIDDLEBURY}; 60 | 61 | // parameter settings 62 | struct parameters { 63 | int32_t disp_min; // min disparity 64 | int32_t disp_max; // max disparity 65 | float support_threshold; // max. uniqueness ratio (best vs. 
second best support match) 66 | int32_t support_texture; // min texture for support points 67 | int32_t candidate_stepsize; // step size of regular grid on which support points are matched 68 | int32_t incon_window_size; // window size of inconsistent support point check 69 | int32_t incon_threshold; // disparity similarity threshold for support point to be considered consistent 70 | int32_t incon_min_support; // minimum number of consistent support points 71 | bool add_corners; // add support points at image corners with nearest neighbor disparities 72 | int32_t grid_size; // size of neighborhood for additional support point extrapolation 73 | float beta; // image likelihood parameter 74 | float gamma; // prior constant 75 | float sigma; // prior sigma 76 | float sradius; // prior sigma radius 77 | int32_t match_texture; // min texture for dense matching 78 | int32_t lr_threshold; // disparity threshold for left/right consistency check 79 | float speckle_sim_threshold; // similarity threshold for speckle segmentation 80 | int32_t speckle_size; // maximal size of a speckle (small speckles get removed) 81 | int32_t ipol_gap_width; // interpolate small gaps (left<->right, top<->bottom) 82 | bool filter_median; // optional median filter (approximated) 83 | bool filter_adaptive_mean; // optional adaptive mean filter (approximated) 84 | bool postprocess_only_left; // saves time by not postprocessing the right image 85 | bool subsampling; // saves time by only computing disparities for each 2nd pixel 86 | // note: for this option D1 and D2 must be passed with size 87 | // width/2 x height/2 (rounded towards zero) 88 | 89 | // constructor 90 | parameters (setting s=ROBOTICS) { 91 | 92 | // default settings in a robotics environment 93 | // (do not produce results in half-occluded areas 94 | // and are a bit more robust towards lighting etc.) 
95 | if (s==ROBOTICS) { 96 | disp_min = 0; 97 | disp_max = 255; 98 | support_threshold = 0.85; 99 | support_texture = 10; 100 | candidate_stepsize = 5; 101 | incon_window_size = 5; 102 | incon_threshold = 5; 103 | incon_min_support = 5; 104 | add_corners = 0; 105 | grid_size = 20; 106 | beta = 0.02; 107 | gamma = 3; 108 | sigma = 1; 109 | sradius = 2; 110 | match_texture = 1; 111 | lr_threshold = 2; 112 | speckle_sim_threshold = 1; 113 | speckle_size = 200; 114 | ipol_gap_width = 3; 115 | filter_median = 0; 116 | filter_adaptive_mean = 1; 117 | postprocess_only_left = 1; 118 | subsampling = 0; 119 | 120 | // default settings for middlebury benchmark 121 | // (interpolate all missing disparities) 122 | } else { 123 | disp_min = 0; 124 | disp_max = 255; 125 | support_threshold = 0.95; 126 | support_texture = 10; 127 | candidate_stepsize = 5; 128 | incon_window_size = 5; 129 | incon_threshold = 5; 130 | incon_min_support = 5; 131 | add_corners = 1; 132 | grid_size = 20; 133 | beta = 0.02; 134 | gamma = 5; 135 | sigma = 1; 136 | sradius = 3; 137 | match_texture = 0; 138 | lr_threshold = 2; 139 | speckle_sim_threshold = 1; 140 | speckle_size = 200; 141 | ipol_gap_width = 5000; 142 | filter_median = 1; 143 | filter_adaptive_mean = 0; 144 | postprocess_only_left = 0; 145 | subsampling = 0; 146 | } 147 | } 148 | }; 149 | 150 | // constructor, input: parameters 151 | Elas (parameters param) : param(param) {} 152 | 153 | // deconstructor 154 | ~Elas () {} 155 | 156 | // matching function 157 | // inputs: pointers to left (I1) and right (I2) intensity image (uint8, input) 158 | // pointers to left (D1) and right (D2) disparity image (float, output) 159 | // dims[0] = width of I1 and I2 160 | // dims[1] = height of I1 and I2 161 | // dims[2] = bytes per line (often equal to width, but allowed to differ) 162 | // note: D1 and D2 must be allocated before (bytes per line = width) 163 | // if subsampling is not active their size is width x height, 164 | // otherwise width/2 x 
height/2 (rounded towards zero) 165 | void process (uint8_t* I1,uint8_t* I2,float* D1,float* D2,const int32_t* dims); 166 | 167 | // This was originally "private" 168 | // Was converted to allow sub-classes to call this 169 | // This assumes the user knows what they are doing 170 | public: 171 | 172 | struct support_pt { 173 | int32_t u; 174 | int32_t v; 175 | int32_t d; 176 | support_pt(int32_t u,int32_t v,int32_t d):u(u),v(v),d(d){} 177 | }; 178 | 179 | struct triangle { 180 | int32_t c1,c2,c3; 181 | float t1a,t1b,t1c; 182 | float t2a,t2b,t2c; 183 | triangle(int32_t c1,int32_t c2,int32_t c3):c1(c1),c2(c2),c3(c3){} 184 | }; 185 | 186 | inline uint32_t getAddressOffsetImage (const int32_t& u,const int32_t& v,const int32_t& width) { 187 | return v*width+u; 188 | } 189 | 190 | inline uint32_t getAddressOffsetGrid (const int32_t& x,const int32_t& y,const int32_t& d,const int32_t& width,const int32_t& disp_num) { 191 | return (y*width+x)*disp_num+d; 192 | } 193 | 194 | // support point functions 195 | virtual void removeInconsistentSupportPoints (int16_t* D_can,int32_t D_can_width,int32_t D_can_height); 196 | virtual void removeRedundantSupportPoints (int16_t* D_can,int32_t D_can_width,int32_t D_can_height, 197 | int32_t redun_max_dist, int32_t redun_threshold, bool vertical); 198 | virtual void addCornerSupportPoints (std::vector &p_support); 199 | inline int16_t computeMatchingDisparity (const int32_t &u,const int32_t &v,uint8_t* I1_desc,uint8_t* I2_desc,const bool &right_image); 200 | virtual std::vector computeSupportMatches (uint8_t* I1_desc,uint8_t* I2_desc); 201 | 202 | // triangulation & grid 203 | virtual std::vector computeDelaunayTriangulation (std::vector p_support,int32_t right_image); 204 | virtual void computeDisparityPlanes (std::vector p_support,std::vector &tri,int32_t right_image); 205 | virtual void createGrid (std::vector p_support,int32_t* disparity_grid,int32_t* grid_dims,bool right_image); 206 | 207 | // matching 208 | inline void 
updatePosteriorMinimum (__m128i* I2_block_addr,const int32_t &d,const int32_t &w, 209 | const __m128i &xmm1,__m128i &xmm2,int32_t &val,int32_t &min_val,int32_t &min_d); 210 | inline void updatePosteriorMinimum (__m128i* I2_block_addr,const int32_t &d, 211 | const __m128i &xmm1,__m128i &xmm2,int32_t &val,int32_t &min_val,int32_t &min_d); 212 | inline void findMatch (int32_t &u,int32_t &v,float &plane_a,float &plane_b,float &plane_c, 213 | int32_t* disparity_grid,int32_t *grid_dims,uint8_t* I1_desc,uint8_t* I2_desc, 214 | int32_t *P,int32_t &plane_radius,bool &valid,bool &right_image,float* D); 215 | virtual void computeDisparity (std::vector p_support,std::vector tri,int32_t* disparity_grid,int32_t* grid_dims, 216 | uint8_t* I1_desc,uint8_t* I2_desc,bool right_image,float* D); 217 | 218 | // L/R consistency check 219 | virtual void leftRightConsistencyCheck (float* D1,float* D2); 220 | 221 | // postprocessing 222 | virtual void removeSmallSegments (float* D); 223 | virtual void gapInterpolation (float* D); 224 | 225 | // optional postprocessing 226 | virtual void adaptiveMean (float* D); 227 | virtual void median (float* D); 228 | 229 | // parameter set 230 | parameters param; 231 | 232 | // memory aligned input images + dimensions 233 | uint8_t *I1,*I2; 234 | int32_t width,height,bpl; 235 | 236 | // profiling timer 237 | #ifdef PROFILE 238 | Timer timer; 239 | #endif 240 | }; 241 | 242 | #endif 243 | -------------------------------------------------------------------------------- /CPU/filter.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 
7 | Authors: Julius Ziegler, Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | #include "filter.h" 27 | 28 | // define fixed-width datatypes for Visual Studio projects 29 | #ifndef _MSC_VER 30 | #include 31 | #else 32 | typedef __int8 int8_t; 33 | typedef __int16 int16_t; 34 | typedef __int32 int32_t; 35 | typedef __int64 int64_t; 36 | typedef unsigned __int8 uint8_t; 37 | typedef unsigned __int16 uint16_t; 38 | typedef unsigned __int32 uint32_t; 39 | typedef unsigned __int64 uint64_t; 40 | #endif 41 | 42 | // fast filters: implements 3x3 and 5x5 sobel filters and 43 | // 5x5 blob and corner filters based on SSE2/3 instructions 44 | namespace filter { 45 | 46 | // private namespace, public user functions at the bottom of this file 47 | namespace detail { 48 | void integral_image( const uint8_t* in, int32_t* out, int w, int h ) { 49 | int32_t* out_top = out; 50 | const uint8_t* line_end = in + w; 51 | const uint8_t* in_end = in + w*h; 52 | int32_t line_sum = 0; 53 | for( ; in != line_end; in++, out++ ) { 54 | line_sum += *in; 55 | *out = line_sum; 56 | } 57 | for( ; in != in_end; ) { 58 | int32_t line_sum = 0; 59 | const uint8_t* line_end = in + w; 60 | for( ; in != line_end; in++, out++, out_top++ ) { 61 | line_sum += *in; 62 | *out = *out_top + 
line_sum; 63 | } 64 | } 65 | } 66 | 67 | void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) { 68 | __m128i zero = _mm_setzero_si128(); 69 | b0 = _mm_unpacklo_epi8( a, zero ); 70 | b1 = _mm_unpackhi_epi8( a, zero ); 71 | } 72 | 73 | void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) { 74 | b = _mm_packus_epi16( a0, a1 ); 75 | } 76 | 77 | // convolve image with a (1,4,6,4,1) row vector. Result is accumulated into output. 78 | // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255]. 79 | void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) { 80 | assert( w % 16 == 0 && "width must be multiple of 16!" ); 81 | const __m128i* i0 = (const __m128i*)(in); 82 | const int16_t* i1 = in+1; 83 | const int16_t* i2 = in+2; 84 | const int16_t* i3 = in+3; 85 | const int16_t* i4 = in+4; 86 | uint8_t* result = out + 2; 87 | const int16_t* const end_input = in + w*h; 88 | __m128i offs = _mm_set1_epi16( 128 ); 89 | for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) { 90 | __m128i result_register_lo; 91 | __m128i result_register_hi; 92 | for( int i=0; i<2; i++ ) { 93 | __m128i* result_register; 94 | if( i==0 ) result_register = &result_register_lo; 95 | else result_register = &result_register_hi; 96 | __m128i i0_register = *i0; 97 | __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) ); 98 | __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) ); 99 | __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) ); 100 | __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) ); 101 | *result_register = _mm_setzero_si128(); 102 | *result_register = _mm_add_epi16( i0_register, *result_register ); 103 | i1_register = _mm_add_epi16( i1_register, i1_register ); 104 | i1_register = _mm_add_epi16( i1_register, i1_register ); 105 | *result_register = _mm_add_epi16( i1_register, *result_register ); 106 | i2_register = _mm_add_epi16( i2_register, 
i2_register ); 107 | *result_register = _mm_add_epi16( i2_register, *result_register ); 108 | i2_register = _mm_add_epi16( i2_register, i2_register ); 109 | *result_register = _mm_add_epi16( i2_register, *result_register ); 110 | i3_register = _mm_add_epi16( i3_register, i3_register ); 111 | i3_register = _mm_add_epi16( i3_register, i3_register ); 112 | *result_register = _mm_add_epi16( i3_register, *result_register ); 113 | *result_register = _mm_add_epi16( i4_register, *result_register ); 114 | *result_register = _mm_srai_epi16( *result_register, 7 ); 115 | *result_register = _mm_add_epi16( *result_register, offs ); 116 | if( i==0 ) { 117 | i0 += 1; 118 | i1 += 8; 119 | i2 += 8; 120 | i3 += 8; 121 | i4 += 8; 122 | } 123 | } 124 | pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo ); 125 | _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); 126 | } 127 | } 128 | 129 | // convolve image with a (1,2,0,-2,-1) row vector. Result is accumulated into output. 130 | // This one works on 16bit input and 8bit output. 131 | // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255]. 132 | void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) { 133 | assert( w % 16 == 0 && "width must be multiple of 16!" 
); 134 | const __m128i* i0 = (const __m128i*)(in); 135 | const int16_t* i1 = in+1; 136 | const int16_t* i3 = in+3; 137 | const int16_t* i4 = in+4; 138 | uint8_t* result = out + 2; 139 | const int16_t* const end_input = in + w*h; 140 | __m128i offs = _mm_set1_epi16( 128 ); 141 | for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) { 142 | __m128i result_register_lo; 143 | __m128i result_register_hi; 144 | for( int i=0; i<2; i++ ) { 145 | __m128i* result_register; 146 | if( i==0 ) result_register = &result_register_lo; 147 | else result_register = &result_register_hi; 148 | __m128i i0_register = *i0; 149 | __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) ); 150 | __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) ); 151 | __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) ); 152 | *result_register = _mm_setzero_si128(); 153 | *result_register = _mm_add_epi16( i0_register, *result_register ); 154 | i1_register = _mm_add_epi16( i1_register, i1_register ); 155 | *result_register = _mm_add_epi16( i1_register, *result_register ); 156 | i3_register = _mm_add_epi16( i3_register, i3_register ); 157 | *result_register = _mm_sub_epi16( *result_register, i3_register ); 158 | *result_register = _mm_sub_epi16( *result_register, i4_register ); 159 | *result_register = _mm_srai_epi16( *result_register, 7 ); 160 | *result_register = _mm_add_epi16( *result_register, offs ); 161 | if( i==0 ) { 162 | i0 += 1; 163 | i1 += 8; 164 | i3 += 8; 165 | i4 += 8; 166 | } 167 | } 168 | pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo ); 169 | _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); 170 | } 171 | } 172 | 173 | // convolve image with a (1,2,1) row vector. Result is accumulated into output. 174 | // This one works on 16bit input and 8bit output. 175 | // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255]. 
176 | void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) { 177 | assert( w % 16 == 0 && "width must be multiple of 16!" ); 178 | const __m128i* i0 = (const __m128i*)(in); 179 | const int16_t* i1 = in+1; 180 | const int16_t* i2 = in+2; 181 | uint8_t* result = out + 1; 182 | const int16_t* const end_input = in + w*h; 183 | const size_t blocked_loops = (w*h-2)/16; 184 | __m128i offs = _mm_set1_epi16( 128 ); 185 | for( size_t i=0; i != blocked_loops; i++ ) { 186 | __m128i result_register_lo; 187 | __m128i result_register_hi; 188 | __m128i i1_register; 189 | __m128i i2_register; 190 | 191 | i1_register = _mm_loadu_si128( (__m128i*)( i1 ) ); 192 | i2_register = _mm_loadu_si128( (__m128i*)( i2 ) ); 193 | result_register_lo = *i0; 194 | i1_register = _mm_add_epi16( i1_register, i1_register ); 195 | result_register_lo = _mm_add_epi16( i1_register, result_register_lo ); 196 | result_register_lo = _mm_add_epi16( i2_register, result_register_lo ); 197 | result_register_lo = _mm_srai_epi16( result_register_lo, 2 ); 198 | result_register_lo = _mm_add_epi16( result_register_lo, offs ); 199 | 200 | i0++; 201 | i1+=8; 202 | i2+=8; 203 | 204 | i1_register = _mm_loadu_si128( (__m128i*)( i1 ) ); 205 | i2_register = _mm_loadu_si128( (__m128i*)( i2 ) ); 206 | result_register_hi = *i0; 207 | i1_register = _mm_add_epi16( i1_register, i1_register ); 208 | result_register_hi = _mm_add_epi16( i1_register, result_register_hi ); 209 | result_register_hi = _mm_add_epi16( i2_register, result_register_hi ); 210 | result_register_hi = _mm_srai_epi16( result_register_hi, 2 ); 211 | result_register_hi = _mm_add_epi16( result_register_hi, offs ); 212 | 213 | i0++; 214 | i1+=8; 215 | i2+=8; 216 | 217 | pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo ); 218 | _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); 219 | 220 | result += 16; 221 | } 222 | } 223 | 224 | // convolve image with a (1,0,-1) row vector. 
Result is accumulated into output. 225 | // This one works on 16bit input and 8bit output. 226 | // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255]. 227 | void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) { 228 | assert( w % 16 == 0 && "width must be multiple of 16!" ); 229 | const __m128i* i0 = (const __m128i*)(in); 230 | const int16_t* i2 = in+2; 231 | uint8_t* result = out + 1; 232 | const int16_t* const end_input = in + w*h; 233 | const size_t blocked_loops = (w*h-2)/16; 234 | __m128i offs = _mm_set1_epi16( 128 ); 235 | for( size_t i=0; i != blocked_loops; i++ ) { 236 | __m128i result_register_lo; 237 | __m128i result_register_hi; 238 | __m128i i2_register; 239 | 240 | i2_register = _mm_loadu_si128( (__m128i*)( i2 ) ); 241 | result_register_lo = *i0; 242 | result_register_lo = _mm_sub_epi16( result_register_lo, i2_register ); 243 | result_register_lo = _mm_srai_epi16( result_register_lo, 2 ); 244 | result_register_lo = _mm_add_epi16( result_register_lo, offs ); 245 | 246 | i0 += 1; 247 | i2 += 8; 248 | 249 | i2_register = _mm_loadu_si128( (__m128i*)( i2 ) ); 250 | result_register_hi = *i0; 251 | result_register_hi = _mm_sub_epi16( result_register_hi, i2_register ); 252 | result_register_hi = _mm_srai_epi16( result_register_hi, 2 ); 253 | result_register_hi = _mm_add_epi16( result_register_hi, offs ); 254 | 255 | i0 += 1; 256 | i2 += 8; 257 | 258 | pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo ); 259 | _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); 260 | 261 | result += 16; 262 | } 263 | 264 | for( ; i2 < end_input; i2++, result++) { 265 | *result = ((*(i2-2) - *i2)>>2)+128; 266 | } 267 | } 268 | 269 | void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) { 270 | using namespace std; 271 | memset( out_h, 0, w*h*sizeof(int16_t) ); 272 | memset( out_v, 0, w*h*sizeof(int16_t) ); 273 | assert( w % 16 
== 0 && "width must be multiple of 16!" ); 274 | const int w_chunk = w/16; 275 | __m128i* i0 = (__m128i*)( in ); 276 | __m128i* i1 = (__m128i*)( in ) + w_chunk*1; 277 | __m128i* i2 = (__m128i*)( in ) + w_chunk*2; 278 | __m128i* i3 = (__m128i*)( in ) + w_chunk*3; 279 | __m128i* i4 = (__m128i*)( in ) + w_chunk*4; 280 | __m128i* result_h = (__m128i*)( out_h ) + 4*w_chunk; 281 | __m128i* result_v = (__m128i*)( out_v ) + 4*w_chunk; 282 | __m128i* end_input = (__m128i*)( in ) + w_chunk*h; 283 | __m128i sixes = _mm_set1_epi16( 6 ); 284 | __m128i fours = _mm_set1_epi16( 4 ); 285 | for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) { 286 | __m128i ilo, ihi; 287 | unpack_8bit_to_16bit( *i0, ihi, ilo ); 288 | *result_h = _mm_add_epi16( ihi, *result_h ); 289 | *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) ); 290 | *result_v = _mm_add_epi16( *result_v, ihi ); 291 | *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo ); 292 | unpack_8bit_to_16bit( *i1, ihi, ilo ); 293 | *result_h = _mm_add_epi16( ihi, *result_h ); 294 | *result_h = _mm_add_epi16( ihi, *result_h ); 295 | *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) ); 296 | *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) ); 297 | ihi = _mm_mullo_epi16( ihi, fours ); 298 | ilo = _mm_mullo_epi16( ilo, fours ); 299 | *result_v = _mm_add_epi16( *result_v, ihi ); 300 | *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo ); 301 | unpack_8bit_to_16bit( *i2, ihi, ilo ); 302 | ihi = _mm_mullo_epi16( ihi, sixes ); 303 | ilo = _mm_mullo_epi16( ilo, sixes ); 304 | *result_v = _mm_add_epi16( *result_v, ihi ); 305 | *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo ); 306 | unpack_8bit_to_16bit( *i3, ihi, ilo ); 307 | *result_h = _mm_sub_epi16( *result_h, ihi ); 308 | *result_h = _mm_sub_epi16( *result_h, ihi ); 309 | *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo ); 310 | *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo ); 311 | ihi = _mm_mullo_epi16( ihi, fours ); 312 | ilo = _mm_mullo_epi16( ilo, 
fours ); 313 | *result_v = _mm_add_epi16( *result_v, ihi ); 314 | *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo ); 315 | unpack_8bit_to_16bit( *i4, ihi, ilo ); 316 | *result_h = _mm_sub_epi16( *result_h, ihi ); 317 | *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo ); 318 | *result_v = _mm_add_epi16( *result_v, ihi ); 319 | *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo ); 320 | } 321 | } 322 | 323 | void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) { 324 | memset( out, 0, w*h*sizeof(int16_t) ); 325 | using namespace std; 326 | assert( w % 16 == 0 && "width must be multiple of 16!" ); 327 | const int w_chunk = w/16; 328 | __m128i* i0 = (__m128i*)( in ); 329 | __m128i* i1 = (__m128i*)( in ) + w_chunk*1; 330 | __m128i* i3 = (__m128i*)( in ) + w_chunk*3; 331 | __m128i* i4 = (__m128i*)( in ) + w_chunk*4; 332 | __m128i* result = (__m128i*)( out ) + 4*w_chunk; 333 | __m128i* end_input = (__m128i*)( in ) + w_chunk*h; 334 | for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) { 335 | __m128i ilo0, ihi0; 336 | unpack_8bit_to_16bit( *i0, ihi0, ilo0 ); 337 | __m128i ilo1, ihi1; 338 | unpack_8bit_to_16bit( *i1, ihi1, ilo1 ); 339 | *result = _mm_add_epi16( ihi0, ihi1 ); 340 | *(result+1) = _mm_add_epi16( ilo0, ilo1 ); 341 | __m128i ilo, ihi; 342 | unpack_8bit_to_16bit( *i3, ihi, ilo ); 343 | *result = _mm_sub_epi16( *result, ihi ); 344 | *(result+1) = _mm_sub_epi16( *(result+1), ilo ); 345 | unpack_8bit_to_16bit( *i4, ihi, ilo ); 346 | *result = _mm_sub_epi16( *result, ihi ); 347 | *(result+1) = _mm_sub_epi16( *(result+1), ilo ); 348 | } 349 | } 350 | 351 | void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) { 352 | assert( w % 16 == 0 && "width must be multiple of 16!" 
); 353 | const __m128i* i0 = (const __m128i*)(in); 354 | const int16_t* i1 = in+1; 355 | const int16_t* i3 = in+3; 356 | const int16_t* i4 = in+4; 357 | int16_t* result = out + 2; 358 | const int16_t* const end_input = in + w*h; 359 | for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) { 360 | __m128i result_register; 361 | __m128i i0_register = *i0; 362 | __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) ); 363 | __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) ); 364 | __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) ); 365 | result_register = _mm_add_epi16( i0_register, i1_register ); 366 | result_register = _mm_sub_epi16( result_register, i3_register ); 367 | result_register = _mm_sub_epi16( result_register, i4_register ); 368 | _mm_storeu_si128( ((__m128i*)( result )), result_register ); 369 | } 370 | } 371 |

// Vertical pass shared by both 3x3 Sobel outputs.
//
// For every row r that has a neighbour above and below:
//   out_v(r) = in(r-1) + 2*in(r) + in(r+1)     column kernel (1,2,1)
//   out_h(r) = in(r-1)           - in(r+1)     column kernel (1,0,-1)
// The first and the last row of both outputs are NOT written by this function
// (there is no memset here), so they keep whatever the buffers contained.
//
// in            8-bit image, w*h bytes, read through __m128i pointers
//               (16-byte aligned)
// out_v, out_h  16-bit results, w*h elements each, written through __m128i
//               pointers (16-byte aligned)
// w             image width, must be a multiple of 16
// h             image height; for h < 3 nothing is written
void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  using namespace std;
  assert( w % 16 == 0 && "width must be multiple of 16!" );

  // With fewer than 3 rows the row-2 pointer would start beyond end_input and
  // the `!=` loop condition below could never become false.
  if( h < 3 )
    return;

  const int w_chunk = w/16;                        // __m128i blocks per image row
  const __m128i* row0 = (const __m128i*)( in );
  const __m128i* row1 = row0 + w_chunk*1;
  const __m128i* row2 = row0 + w_chunk*2;
  const __m128i* const end_input = row0 + w_chunk*h;
  // 2*w_chunk blocks of eight int16 == w elements: results start in row 1
  __m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
  __m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;

  for( ; row2 != end_input; row0++, row1++, row2++, result_v+=2, result_h+=2 ) {
    // each input row is unpacked exactly once (the original unpacked row 0
    // twice) and the sums are formed in registers instead of zeroing the
    // output and accumulating through memory
    __m128i hi0, lo0, hi1, lo1, hi2, lo2;
    unpack_8bit_to_16bit( *row0, hi0, lo0 );
    unpack_8bit_to_16bit( *row1, hi1, lo1 );
    unpack_8bit_to_16bit( *row2, hi2, lo2 );

    // (1,0,-1)
    *result_h     = _mm_sub_epi16( hi0, hi2 );
    *(result_h+1) = _mm_sub_epi16( lo0, lo2 );

    // (1,2,1)
    *result_v     = _mm_add_epi16( _mm_add_epi16( hi0, hi2 ), _mm_add_epi16( hi1, hi1 ) );
    *(result_v+1) = _mm_add_epi16( _mm_add_epi16( lo0, lo2 ), _mm_add_epi16( lo1, lo1 ) );
  }
}
}; // end of namespace detail

// 3x3 Sobel filter.
//
// Runs the shared column pass into two temporary 16-bit images, then a
// (1,0,-1) row pass on the column-smoothed image to produce out_v and a
// (1,2,1) row pass on the column-differenced image to produce out_h.
//
// in            8-bit input image, w*h bytes
// out_v, out_h  8-bit outputs, w*h bytes each
// w, h          image size; w must be a multiple of 16 (asserted by the column
//               pass)
void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
  // 16-byte aligned scratch images, released again before returning
  int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );

  detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
  detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
  detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );

  _mm_free( temp_h );
  _mm_free( temp_v );
}

void
sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) { 419 | int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) ); 420 | int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) ); 421 | detail::convolve_cols_5x5( in, temp_v, temp_h, w, h ); 422 | detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h ); 423 | detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h ); 424 | _mm_free( temp_h ); 425 | _mm_free( temp_v ); 426 | } 427 | 428 | // -1 -1 0 1 1 429 | // -1 -1 0 1 1 430 | // 0 0 0 0 0 431 | // 1 1 0 -1 -1 432 | // 1 1 0 -1 -1 433 | void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) { 434 | int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) ); 435 | detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h ); 436 | detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h ); 437 | _mm_free( temp ); 438 | } 439 | 440 | // -1 -1 -1 -1 -1 441 | // -1 1 1 1 -1 442 | // -1 1 8 1 -1 443 | // -1 1 1 1 -1 444 | // -1 -1 -1 -1 -1 445 | void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) { 446 | int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) ); 447 | detail::integral_image( in, integral, w, h ); 448 | int16_t* out_ptr = out + 3 + 3*w; 449 | int16_t* out_end = out + w * h - 2 - 2*w; 450 | const int32_t* i00 = integral; 451 | const int32_t* i50 = integral + 5; 452 | const int32_t* i05 = integral + 5*w; 453 | const int32_t* i55 = integral + 5 + 5*w; 454 | const int32_t* i11 = integral + 1 + 1*w; 455 | const int32_t* i41 = integral + 4 + 1*w; 456 | const int32_t* i14 = integral + 1 + 4*w; 457 | const int32_t* i44 = integral + 4 + 4*w; 458 | const uint8_t* im22 = in + 3 + 3*w; 459 | for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) { 460 | int32_t result = 0; 461 | result = -( *i55 - *i50 - *i05 + *i00 ); 462 | result += 2*( *i44 - *i41 - *i14 + *i11 ); 463 | result += 7* *im22; 464 | *out_ptr = result; 
465 | } 466 | _mm_free( integral ); 467 | } 468 | }; 469 | -------------------------------------------------------------------------------- /CPU/filter.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 7 | Authors: Julius Ziegler, Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #ifndef __FILTER_H__ 23 | #define __FILTER_H__ 24 | 25 | #include 26 | #include 27 | 28 | // define fixed-width datatypes for Visual Studio projects 29 | #ifndef _MSC_VER 30 | #include 31 | #else 32 | typedef __int8 int8_t; 33 | typedef __int16 int16_t; 34 | typedef __int32 int32_t; 35 | typedef __int64 int64_t; 36 | typedef unsigned __int8 uint8_t; 37 | typedef unsigned __int16 uint16_t; 38 | typedef unsigned __int32 uint32_t; 39 | typedef unsigned __int64 uint64_t; 40 | #endif 41 | 42 | // fast filters: implements 3x3 and 5x5 sobel filters and 43 | // 5x5 blob and corner filters based on SSE2/3 instructions 44 | namespace filter { 45 | 46 | // private namespace, public user functions at the bottom of this file 47 | namespace detail { 48 | void integral_image( const uint8_t* in, int32_t* out, int w, int 
h ); 49 | void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ); 50 | void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ); 51 | 52 | // convolve image with a (1,4,6,4,1) row vector. Result is accumulated into output. 53 | // output is scaled by 1/128, then clamped to [-128,127], and finally shifted to [0,255]. 54 | void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ); 55 | 56 | // convolve image with a (1,2,0,-2,-1) row vector. Result is accumulated into output. 57 | // This one works on 16bit input and 8bit output. 58 | // output is scaled by 1/128, then clamped to [-128,127], and finally shifted to [0,255]. 59 | void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ); 60 | 61 | // convolve image with a (1,2,1) row vector. Result is accumulated into output. 62 | // This one works on 16bit input and 8bit output. 63 | // output is scaled by 1/4, then clamped to [-128,127], and finally shifted to [0,255]. 64 | void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ); 65 | 66 | // convolve image with a (1,0,-1) row vector. Result is accumulated into output. 67 | // This one works on 16bit input and 8bit output. 68 | // output is scaled by 1/4, then clamped to [-128,127], and finally shifted to [0,255]. 
69 | void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ); 70 | 71 | void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ); 72 | 73 | void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ); 74 | 75 | void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ); 76 | 77 | void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ); 78 | } 79 | 80 | void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ); 81 | 82 | void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ); 83 | 84 | // -1 -1 0 1 1 85 | // -1 -1 0 1 1 86 | // 0 0 0 0 0 87 | // 1 1 0 -1 -1 88 | // 1 1 0 -1 -1 89 | void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ); 90 | 91 | // -1 -1 -1 -1 -1 92 | // -1 1 1 1 -1 93 | // -1 1 8 1 -1 94 | // -1 1 1 1 -1 95 | // -1 -1 -1 -1 -1 96 | void blob5x5( const uint8_t* in, int16_t* out, int w, int h ); 97 | }; 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /CPU/image.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 7 | Authors: Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | // basic image I/O, based on Pedro Felzenszwalb's code 23 | 24 | #ifndef IMAGE_H 25 | #define IMAGE_H 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // use imRef to access image data. 33 | #define imRef(im, x, y) (im->access[y][x]) 34 | 35 | // use imPtr to get pointer to image data. 36 | #define imPtr(im, x, y) &(im->access[y][x]) 37 | 38 | #define BUF_SIZE 256 39 | 40 | typedef unsigned char uchar; 41 | typedef struct { uchar r, g, b; } rgb; 42 | 43 | inline bool operator==(const rgb &a, const rgb &b) { 44 | return ((a.r == b.r) && (a.g == b.g) && (a.b == b.b)); 45 | } 46 | 47 | // image class 48 | template class image { 49 | public: 50 | 51 | // create image 52 | image(const int width, const int height, const bool init = false); 53 | 54 | // delete image 55 | ~image(); 56 | 57 | // init image 58 | void init(const T &val); 59 | 60 | // deep copy 61 | image *copy() const; 62 | 63 | // get image width/height 64 | int width() const { return w; } 65 | int height() const { return h; } 66 | 67 | // image data 68 | T *data; 69 | 70 | // row pointers 71 | T **access; 72 | 73 | private: 74 | int w, h; 75 | }; 76 | 77 | template image::image(const int width, const int height, const bool init) { 78 | w = width; 79 | h = height; 80 | data = new T[w * h]; // allocate space for image data 81 | access = new T*[h]; // allocate space for row pointers 82 | 83 | // initialize row pointers 84 | for (int i = 0; i < h; i++) 85 | access[i] = data + (i * w); 86 | 87 | // init to zero 88 | if (init) 89 | memset(data, 0, w * h * sizeof(T)); 90 | } 91 | 92 | template image::~image() { 93 | delete [] data; 94 | delete [] access; 95 | } 96 | 97 | template void image::init(const T &val) { 98 | T *ptr = imPtr(this, 0, 0); 99 | T *end = 
imPtr(this, w-1, h-1); 100 | while (ptr <= end) 101 | *ptr++ = val; 102 | } 103 | 104 | 105 | template image *image::copy() const { 106 | image *im = new image(w, h, false); 107 | memcpy(im->data, data, w * h * sizeof(T)); 108 | return im; 109 | } 110 | 111 | class pnm_error {}; 112 | 113 | void pnm_read(std::ifstream &file, char *buf) { 114 | char doc[BUF_SIZE]; 115 | char c; 116 | 117 | file >> c; 118 | while (c == '#') { 119 | file.getline(doc, BUF_SIZE); 120 | file >> c; 121 | } 122 | file.putback(c); 123 | 124 | file.width(BUF_SIZE); 125 | file >> buf; 126 | file.ignore(); 127 | } 128 | 129 | image *loadPGM(const char *name) { 130 | char buf[BUF_SIZE]; 131 | 132 | // read header 133 | std::ifstream file(name, std::ios::in | std::ios::binary); 134 | pnm_read(file, buf); 135 | if (strncmp(buf, "P5", 2)) { 136 | std::cout << "ERROR: Could not read file " << name << std::endl; 137 | throw pnm_error(); 138 | } 139 | 140 | pnm_read(file, buf); 141 | int width = atoi(buf); 142 | pnm_read(file, buf); 143 | int height = atoi(buf); 144 | 145 | pnm_read(file, buf); 146 | if (atoi(buf) > UCHAR_MAX) { 147 | std::cout << "ERROR: Could not read file " << name << std::endl; 148 | throw pnm_error(); 149 | } 150 | 151 | // read data 152 | image *im = new image(width, height); 153 | file.read((char *)imPtr(im, 0, 0), width * height * sizeof(uchar)); 154 | 155 | return im; 156 | } 157 | 158 | void savePGM(image *im, const char *name) { 159 | int width = im->width(); 160 | int height = im->height(); 161 | std::ofstream file(name, std::ios::out | std::ios::binary); 162 | 163 | file << "P5\n" << width << " " << height << "\n" << UCHAR_MAX << "\n"; 164 | file.write((char *)imPtr(im, 0, 0), width * height * sizeof(uchar)); 165 | } 166 | 167 | #endif 168 | -------------------------------------------------------------------------------- /CPU/matrix.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 
3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libviso2. 7 | Authors: Andreas Geiger 8 | 9 | libviso2 is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 2 of the License, or any later version. 12 | 13 | libviso2 is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libviso2; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #include "matrix.h" 23 | #include 24 | #include 25 | 26 | #define SWAP(a,b) {temp=a;a=b;b=temp;} 27 | #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) 28 | static FLOAT sqrarg; 29 | #define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg) 30 | static FLOAT maxarg1,maxarg2; 31 | #define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) 32 | static int32_t iminarg1,iminarg2; 33 | #define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2)) 34 | 35 | 36 | using namespace std; 37 | 38 | Matrix::Matrix () { 39 | m = 0; 40 | n = 0; 41 | val = 0; 42 | } 43 | 44 | Matrix::Matrix (const int32_t m_,const int32_t n_) { 45 | allocateMemory(m_,n_); 46 | } 47 | 48 | Matrix::Matrix (const int32_t m_,const int32_t n_,const FLOAT* val_) { 49 | allocateMemory(m_,n_); 50 | int32_t k=0; 51 | for (int32_t i=0; i0) 73 | for (int32_t i=0; i=m || j1<0 || j2>=n || i2m || j1+M.n>n) { 106 | cerr << "ERROR: Cannot set submatrix [" << i1 << ".." << i1+M.m-1 << 107 | "] x [" << j1 << ".." 
<< j1+M.n-1 << "]" << 108 | " of a (" << m << "x" << n << ") matrix." << endl; 109 | exit(0); 110 | } 111 | for (int32_t i=0; i idx) { 139 | Matrix M(m,idx.size()); 140 | for (int32_t j=0; j1 && M.n==1) { 164 | Matrix D(M.m,M.m); 165 | for (int32_t i=0; i1) { 169 | Matrix D(M.n,M.n); 170 | for (int32_t i=0; i=big) { 449 | big=fabs(A.val[j][k]); 450 | irow=j; 451 | icol=k; 452 | } 453 | ++(ipiv[icol]); 454 | 455 | // We now have the pivot element, so we interchange rows, if needed, to put the pivot 456 | // element on the diagonal. The columns are not physically interchanged, only relabeled. 457 | if (irow != icol) { 458 | for (l=0;l=0;l--) { 492 | if (indxr[l]!=indxc[l]) 493 | for (k=0;kbig) 526 | big = temp; 527 | if (big == 0.0) { // No nonzero largest element. 528 | free(vv); 529 | return false; 530 | } 531 | vv[i] = 1.0/big; // Save the scaling. 532 | } 533 | for (j=0; j=big) { 547 | big = dum; 548 | imax = i; 549 | } 550 | } 551 | if (j!=imax) { // Do we need to interchange rows? 552 | for (k=0; k=0;i--) { // Accumulation of right-hand transformations. 636 | if (i=0;i--) { // Accumulation of left-hand transformations. 652 | l = i+1; 653 | g = w[i]; 654 | for (j=l;j=0;k--) { // Diagonalization of the bidiagonal form: Loop over singular values, 667 | for (its=0;its<30;its++) { // and over allowed iterations. 668 | flag = 1; 669 | for (l=k;l>=0;l--) { // Test for splitting. 670 | nm = l-1; 671 | if ((FLOAT)(fabs(rv1[l])+anorm) == anorm) { flag = 0; break; } 672 | if ((FLOAT)(fabs( w[nm])+anorm) == anorm) { break; } 673 | } 674 | if (flag) { 675 | c = 0.0; // Cancellation of rv1[l], if l > 1. 
676 | s = 1.0; 677 | for (i=l;i<=k;i++) { 678 | f = s*rv1[i]; 679 | rv1[i] = c*rv1[i]; 680 | if ((FLOAT)(fabs(f)+anorm) == anorm) break; 681 | g = w[i]; 682 | h = pythag(f,g); 683 | w[i] = h; 684 | h = 1.0/h; 685 | c = g*h; 686 | s = -f*h; 687 | for (j=0;j 1); 783 | for (k=0;k (m+n)/2) { 788 | for (i=0;i absb) 847 | return absa*sqrt(1.0+SQR(absb/absa)); 848 | else 849 | return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb))); 850 | } 851 | 852 | -------------------------------------------------------------------------------- /CPU/matrix.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libviso2. 7 | Authors: Andreas Geiger 8 | 9 | libviso2 is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 2 of the License, or any later version. 12 | 13 | libviso2 is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libviso2; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #ifndef MATRIX_H 23 | #define MATRIX_H 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #ifndef _MSC_VER 32 | #include 33 | #else 34 | typedef __int8 int8_t; 35 | typedef __int16 int16_t; 36 | typedef __int32 int32_t; 37 | typedef __int64 int64_t; 38 | typedef unsigned __int8 uint8_t; 39 | typedef unsigned __int16 uint16_t; 40 | typedef unsigned __int32 uint32_t; 41 | typedef unsigned __int64 uint64_t; 42 | #endif 43 | 44 | #define endll endl << endl // double end line definition 45 | 46 | typedef double FLOAT; // double precision 47 | //typedef float FLOAT; // single precision 48 | 49 | class Matrix { 50 | 51 | public: 52 | 53 | // constructor / deconstructor 54 | Matrix (); // init empty 0x0 matrix 55 | Matrix (const int32_t m,const int32_t n); // init empty mxn matrix 56 | Matrix (const int32_t m,const int32_t n,const FLOAT* val_); // init mxn matrix with values from array 'val' 57 | Matrix (const Matrix &M); // creates deepcopy of M 58 | ~Matrix (); 59 | 60 | // assignment operator, copies contents of M 61 | Matrix& operator= (const Matrix &M); 62 | 63 | // copies submatrix of M into array 'val', default values copy whole row/column/matrix 64 | void getData(FLOAT* val_,int32_t i1=0,int32_t j1=0,int32_t i2=-1,int32_t j2=-1); 65 | 66 | // set or get submatrices of current matrix 67 | Matrix getMat(int32_t i1,int32_t j1,int32_t i2=-1,int32_t j2=-1); 68 | void setMat(const Matrix &M,const int32_t i,const int32_t j); 69 | 70 | // set sub-matrix to scalar (default 0), -1 as end replaces whole row/column/matrix 71 | void setVal(FLOAT s,int32_t i1=0,int32_t j1=0,int32_t i2=-1,int32_t j2=-1); 72 | 73 | // set (part of) diagonal to scalar, -1 as end replaces whole diagonal 74 | void setDiag(FLOAT 
s,int32_t i1=0,int32_t i2=-1); 75 | 76 | // clear matrix 77 | void zero(); 78 | 79 | // extract columns with given index 80 | Matrix extractCols (std::vector idx); 81 | 82 | // create identity matrix 83 | static Matrix eye (const int32_t m); 84 | void eye (); 85 | 86 | // create diagonal matrix with nx1 or 1xn matrix M as elements 87 | static Matrix diag(const Matrix &M); 88 | 89 | // returns the m-by-n matrix whose elements are taken column-wise from M 90 | static Matrix reshape(const Matrix &M,int32_t m,int32_t n); 91 | 92 | // create 3x3 rotation matrices (convention: http://en.wikipedia.org/wiki/Rotation_matrix) 93 | static Matrix rotMatX(const FLOAT &angle); 94 | static Matrix rotMatY(const FLOAT &angle); 95 | static Matrix rotMatZ(const FLOAT &angle); 96 | 97 | // simple arithmetic operations 98 | Matrix operator+ (const Matrix &M); // add matrix 99 | Matrix operator- (const Matrix &M); // subtract matrix 100 | Matrix operator* (const Matrix &M); // multiply with matrix 101 | Matrix operator* (const FLOAT &s); // multiply with scalar 102 | Matrix operator/ (const Matrix &M); // divide elementwise by matrix (or vector) 103 | Matrix operator/ (const FLOAT &s); // divide by scalar 104 | Matrix operator- (); // negative matrix 105 | Matrix operator~ (); // transpose 106 | FLOAT l2norm (); // euclidean norm (vectors) / frobenius norm (matrices) 107 | FLOAT mean (); // mean of all elements in matrix 108 | 109 | // complex arithmetic operations 110 | static Matrix cross (const Matrix &a, const Matrix &b); // cross product of two vectors 111 | static Matrix inv (const Matrix &M); // invert matrix M 112 | bool inv (); // invert this matrix 113 | FLOAT det (); // returns determinant of matrix 114 | bool solve (const Matrix &M,FLOAT eps=1e-20); // solve linear system M*x=B, replaces *this and M 115 | bool lu(int32_t *idx, FLOAT &d, FLOAT eps=1e-20); // replace *this by lower upper decomposition 116 | void svd(Matrix &U,Matrix &W,Matrix &V); // singular value 
decomposition *this = U*diag(W)*V^T 117 | 118 | // print matrix to stream 119 | friend std::ostream& operator<< (std::ostream& out,const Matrix& M); 120 | 121 | // direct data access 122 | FLOAT **val; 123 | int32_t m,n; 124 | 125 | private: 126 | 127 | void allocateMemory (const int32_t m_,const int32_t n_); 128 | void releaseMemory (); 129 | inline FLOAT pythag(FLOAT a,FLOAT b); 130 | 131 | }; 132 | 133 | #endif // MATRIX_H 134 | -------------------------------------------------------------------------------- /CPU/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 7 | Authors: Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | #ifndef __TIMER_H__ 23 | #define __TIMER_H__ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | // Define fixed-width datatypes for Visual Studio projects 35 | #ifndef _MSC_VER 36 | #include 37 | #else 38 | typedef __int8 int8_t; 39 | typedef __int16 int16_t; 40 | typedef __int32 int32_t; 41 | typedef __int64 int64_t; 42 | typedef unsigned __int8 uint8_t; 43 | typedef unsigned __int16 uint16_t; 44 | typedef unsigned __int32 uint32_t; 45 | typedef unsigned __int64 uint64_t; 46 | #endif 47 | 48 | class Timer { 49 | 50 | public: 51 | 52 | Timer() {} 53 | 54 | ~Timer() {} 55 | 56 | void start (std::string title) { 57 | desc.push_back(title); 58 | push_back_time(); 59 | } 60 | 61 | void stop () { 62 | if (time.size()<=desc.size()) 63 | push_back_time(); 64 | } 65 | 66 | void plot () { 67 | stop(); 68 | float total_time = 0; 69 | for (int32_t i=0; i desc; 93 | std::vector time; 94 | 95 | void push_back_time () { 96 | timeval curr_time; 97 | gettimeofday(&curr_time,0); 98 | time.push_back(curr_time); 99 | } 100 | 101 | float getTimeDifferenceMilliseconds(timeval a,timeval b) { 102 | return ((float)(b.tv_sec -a.tv_sec ))*1e+3 + 103 | ((float)(b.tv_usec-a.tv_usec))*1e-3; 104 | } 105 | }; 106 | 107 | #endif 108 | -------------------------------------------------------------------------------- /CPU/triangle.h: -------------------------------------------------------------------------------- 1 | /*****************************************************************************/ 2 | /* */ 3 | /* (triangle.h) */ 4 | /* */ 5 | /* Include file for programs that call Triangle. 
*/ 6 | /* */ 7 | /* Accompanies Triangle Version 1.6 */ 8 | /* July 28, 2005 */ 9 | /* */ 10 | /* Copyright 1996, 2005 */ 11 | /* Jonathan Richard Shewchuk */ 12 | /* 2360 Woolsey #H */ 13 | /* Berkeley, California 94705-1927 */ 14 | /* jrs@cs.berkeley.edu */ 15 | /* */ 16 | /* Modified by Andreas Geiger, 2011 */ 17 | /*****************************************************************************/ 18 | 19 | /*****************************************************************************/ 20 | /* */ 21 | /* How to call Triangle from another program */ 22 | /* */ 23 | /* */ 24 | /* If you haven't read Triangle's instructions (run "triangle -h" to read */ 25 | /* them), you won't understand what follows. */ 26 | /* */ 27 | /* Triangle must be compiled into an object file (triangle.o) with the */ 28 | /* TRILIBRARY symbol defined (generally by using the -DTRILIBRARY compiler */ 29 | /* switch). The makefile included with Triangle will do this for you if */ 30 | /* you run "make trilibrary". The resulting object file can be called via */ 31 | /* the procedure triangulate(). */ 32 | /* */ 33 | /* If the size of the object file is important to you, you may wish to */ 34 | /* generate a reduced version of triangle.o. The REDUCED symbol gets rid */ 35 | /* of all features that are primarily of research interest. Specifically, */ 36 | /* the -DREDUCED switch eliminates Triangle's -i, -F, -s, and -C switches. */ 37 | /* The CDT_ONLY symbol gets rid of all meshing algorithms above and beyond */ 38 | /* constrained Delaunay triangulation. Specifically, the -DCDT_ONLY switch */ 39 | /* eliminates Triangle's -r, -q, -a, -u, -D, -Y, -S, and -s switches. */ 40 | /* */ 41 | /* IMPORTANT: These definitions (TRILIBRARY, REDUCED, CDT_ONLY) must be */ 42 | /* made in the makefile or in triangle.c itself. Putting these definitions */ 43 | /* in this file (triangle.h) will not create the desired effect. */ 44 | /* */ 45 | /* */ 46 | /* The calling convention for triangulate() follows. 
*/ 47 | /* */ 48 | /* void triangulate(triswitches, in, out, vorout) */ 49 | /* char *triswitches; */ 50 | /* struct triangulateio *in; */ 51 | /* struct triangulateio *out; */ 52 | /* struct triangulateio *vorout; */ 53 | /* */ 54 | /* `triswitches' is a string containing the command line switches you wish */ 55 | /* to invoke. No initial dash is required. Some suggestions: */ 56 | /* */ 57 | /* - You'll probably find it convenient to use the `z' switch so that */ 58 | /* points (and other items) are numbered from zero. This simplifies */ 59 | /* indexing, because the first item of any type always starts at index */ 60 | /* [0] of the corresponding array, whether that item's number is zero or */ 61 | /* one. */ 62 | /* - You'll probably want to use the `Q' (quiet) switch in your final code, */ 63 | /* but you can take advantage of Triangle's printed output (including the */ 64 | /* `V' switch) while debugging. */ 65 | /* - If you are not using the `q', `a', `u', `D', `j', or `s' switches, */ 66 | /* then the output points will be identical to the input points, except */ 67 | /* possibly for the boundary markers. If you don't need the boundary */ 68 | /* markers, you should use the `N' (no nodes output) switch to save */ 69 | /* memory. (If you do need boundary markers, but need to save memory, a */ 70 | /* good nasty trick is to set out->pointlist equal to in->pointlist */ 71 | /* before calling triangulate(), so that Triangle overwrites the input */ 72 | /* points with identical copies.) */ 73 | /* - The `I' (no iteration numbers) and `g' (.off file output) switches */ 74 | /* have no effect when Triangle is compiled with TRILIBRARY defined. */ 75 | /* */ 76 | /* `in', `out', and `vorout' are descriptions of the input, the output, */ 77 | /* and the Voronoi output. If the `v' (Voronoi output) switch is not used, */ 78 | /* `vorout' may be NULL. `in' and `out' may never be NULL. 
*/ 79 | /* */ 80 | /* Certain fields of the input and output structures must be initialized, */ 81 | /* as described below. */ 82 | /* */ 83 | /*****************************************************************************/ 84 | 85 | /*****************************************************************************/ 86 | /* */ 87 | /* The `triangulateio' structure. */ 88 | /* */ 89 | /* Used to pass data into and out of the triangulate() procedure. */ 90 | /* */ 91 | /* */ 92 | /* Arrays are used to store points, triangles, markers, and so forth. In */ 93 | /* all cases, the first item in any array is stored starting at index [0]. */ 94 | /* However, that item is item number `1' unless the `z' switch is used, in */ 95 | /* which case it is item number `0'. Hence, you may find it easier to */ 96 | /* index points (and triangles in the neighbor list) if you use the `z' */ 97 | /* switch. Unless, of course, you're calling Triangle from a Fortran */ 98 | /* program. */ 99 | /* */ 100 | /* Description of fields (except the `numberof' fields, which are obvious): */ 101 | /* */ 102 | /* `pointlist': An array of point coordinates. The first point's x */ 103 | /* coordinate is at index [0] and its y coordinate at index [1], followed */ 104 | /* by the coordinates of the remaining points. Each point occupies two */ 105 | /* REALs. */ 106 | /* `pointattributelist': An array of point attributes. Each point's */ 107 | /* attributes occupy `numberofpointattributes' REALs. */ 108 | /* `pointmarkerlist': An array of point markers; one int per point. */ 109 | /* */ 110 | /* `trianglelist': An array of triangle corners. The first triangle's */ 111 | /* first corner is at index [0], followed by its other two corners in */ 112 | /* counterclockwise order, followed by any other nodes if the triangle */ 113 | /* represents a nonlinear element. Each triangle occupies */ 114 | /* `numberofcorners' ints. */ 115 | /* `triangleattributelist': An array of triangle attributes. 
Each */ 116 | /* triangle's attributes occupy `numberoftriangleattributes' REALs. */ 117 | /* `trianglearealist': An array of triangle area constraints; one REAL per */ 118 | /* triangle. Input only. */ 119 | /* `neighborlist': An array of triangle neighbors; three ints per */ 120 | /* triangle. Output only. */ 121 | /* */ 122 | /* `segmentlist': An array of segment endpoints. The first segment's */ 123 | /* endpoints are at indices [0] and [1], followed by the remaining */ 124 | /* segments. Two ints per segment. */ 125 | /* `segmentmarkerlist': An array of segment markers; one int per segment. */ 126 | /* */ 127 | /* `holelist': An array of holes. The first hole's x and y coordinates */ 128 | /* are at indices [0] and [1], followed by the remaining holes. Two */ 129 | /* REALs per hole. Input only, although the pointer is copied to the */ 130 | /* output structure for your convenience. */ 131 | /* */ 132 | /* `regionlist': An array of regional attributes and area constraints. */ 133 | /* The first constraint's x and y coordinates are at indices [0] and [1], */ 134 | /* followed by the regional attribute at index [2], followed by the */ 135 | /* maximum area at index [3], followed by the remaining area constraints. */ 136 | /* Four REALs per area constraint. Note that each regional attribute is */ 137 | /* used only if you select the `A' switch, and each area constraint is */ 138 | /* used only if you select the `a' switch (with no number following), but */ 139 | /* omitting one of these switches does not change the memory layout. */ 140 | /* Input only, although the pointer is copied to the output structure for */ 141 | /* your convenience. */ 142 | /* */ 143 | /* `edgelist': An array of edge endpoints. The first edge's endpoints are */ 144 | /* at indices [0] and [1], followed by the remaining edges. Two ints per */ 145 | /* edge. Output only. */ 146 | /* `edgemarkerlist': An array of edge markers; one int per edge. Output */ 147 | /* only. 
*/ 148 | /* `normlist': An array of normal vectors, used for infinite rays in */ 149 | /* Voronoi diagrams. The first normal vector's x and y magnitudes are */ 150 | /* at indices [0] and [1], followed by the remaining vectors. For each */ 151 | /* finite edge in a Voronoi diagram, the normal vector written is the */ 152 | /* zero vector. Two REALs per edge. Output only. */ 153 | /* */ 154 | /* */ 155 | /* Any input fields that Triangle will examine must be initialized. */ 156 | /* Furthermore, for each output array that Triangle will write to, you */ 157 | /* must either provide space by setting the appropriate pointer to point */ 158 | /* to the space you want the data written to, or you must initialize the */ 159 | /* pointer to NULL, which tells Triangle to allocate space for the results. */ 160 | /* The latter option is preferable, because Triangle always knows exactly */ 161 | /* how much space to allocate. The former option is provided mainly for */ 162 | /* people who need to call Triangle from Fortran code, though it also makes */ 163 | /* possible some nasty space-saving tricks, like writing the output to the */ 164 | /* same arrays as the input. */ 165 | /* */ 166 | /* Triangle will not free() any input or output arrays, including those it */ 167 | /* allocates itself; that's up to you. You should free arrays allocated by */ 168 | /* Triangle by calling the trifree() procedure defined below. (By default, */ 169 | /* trifree() just calls the standard free() library procedure, but */ 170 | /* applications that call triangulate() may replace trimalloc() and */ 171 | /* trifree() in triangle.c to use specialized memory allocators.) */ 172 | /* */ 173 | /* Here's a guide to help you decide which fields you must initialize */ 174 | /* before you call triangulate(). */ 175 | /* */ 176 | /* `in': */ 177 | /* */ 178 | /* - `pointlist' must always point to a list of points; `numberofpoints' */ 179 | /* and `numberofpointattributes' must be properly set. 
*/ 180 | /* `pointmarkerlist' must either be set to NULL (in which case all */ 181 | /* markers default to zero), or must point to a list of markers. If */ 182 | /* `numberofpointattributes' is not zero, `pointattributelist' must */ 183 | /* point to a list of point attributes. */ 184 | /* - If the `r' switch is used, `trianglelist' must point to a list of */ 185 | /* triangles, and `numberoftriangles', `numberofcorners', and */ 186 | /* `numberoftriangleattributes' must be properly set. If */ 187 | /* `numberoftriangleattributes' is not zero, `triangleattributelist' */ 188 | /* must point to a list of triangle attributes. If the `a' switch is */ 189 | /* used (with no number following), `trianglearealist' must point to a */ 190 | /* list of triangle area constraints. `neighborlist' may be ignored. */ 191 | /* - If the `p' switch is used, `segmentlist' must point to a list of */ 192 | /* segments, `numberofsegments' must be properly set, and */ 193 | /* `segmentmarkerlist' must either be set to NULL (in which case all */ 194 | /* markers default to zero), or must point to a list of markers. */ 195 | /* - If the `p' switch is used without the `r' switch, then */ 196 | /* `numberofholes' and `numberofregions' must be properly set. If */ 197 | /* `numberofholes' is not zero, `holelist' must point to a list of */ 198 | /* holes. If `numberofregions' is not zero, `regionlist' must point to */ 199 | /* a list of region constraints. */ 200 | /* - If the `p' switch is used, `holelist', `numberofholes', */ 201 | /* `regionlist', and `numberofregions' are copied to `out'. (You can */ 202 | /* nonetheless get away with not initializing them if the `r' switch is */ 203 | /* used.) */ 204 | /* - `edgelist', `edgemarkerlist', `normlist', and `numberofedges' may be */ 205 | /* ignored. */ 206 | /* */ 207 | /* `out': */ 208 | /* */ 209 | /* - `pointlist' must be initialized (NULL or pointing to memory) unless */ 210 | /* the `N' switch is used.
`pointmarkerlist' must be initialized */ 211 | /* unless the `N' or `B' switch is used. If `N' is not used and */ 212 | /* `in->numberofpointattributes' is not zero, `pointattributelist' must */ 213 | /* be initialized. */ 214 | /* - `trianglelist' must be initialized unless the `E' switch is used. */ 215 | /* `neighborlist' must be initialized if the `n' switch is used. If */ 216 | /* the `E' switch is not used and (`in->numberoftriangleattributes' is */ 217 | /* not zero or the `A' switch is used), `triangleattributelist' must be */ 218 | /* initialized. `trianglearealist' may be ignored. */ 219 | /* - `segmentlist' must be initialized if the `p' or `c' switch is used, */ 220 | /* and the `P' switch is not used. `segmentmarkerlist' must also be */ 221 | /* initialized under these circumstances unless the `B' switch is used. */ 222 | /* - `edgelist' must be initialized if the `e' switch is used. */ 223 | /* `edgemarkerlist' must be initialized if the `e' switch is used and */ 224 | /* the `B' switch is not. */ 225 | /* - `holelist', `regionlist', `normlist', and all scalars may be ignored.*/ 226 | /* */ 227 | /* `vorout' (only needed if the `v' switch is used): */ 228 | /* */ 229 | /* - `pointlist' must be initialized. If `in->numberofpointattributes' */ 230 | /* is not zero, `pointattributelist' must be initialized. */ 231 | /* `pointmarkerlist' may be ignored. */ 232 | /* - `edgelist' and `normlist' must both be initialized. */ 233 | /* `edgemarkerlist' may be ignored. */ 234 | /* - Everything else may be ignored. */ 235 | /* */ 236 | /* After a call to triangulate(), the valid fields of `out' and `vorout' */ 237 | /* will depend, in an obvious way, on the choice of switches used. Note */ 238 | /* that when the `p' switch is used, the pointers `holelist' and */ 239 | /* `regionlist' are copied from `in' to `out', but no new space is */ 240 | /* allocated; be careful that you don't free() the same array twice.
On */ 241 | /* the other hand, Triangle will never copy the `pointlist' pointer (or any */ 242 | /* others); new space is allocated for `out->pointlist', or if the `N' */ 243 | /* switch is used, `out->pointlist' remains uninitialized. */ 244 | /* */ 245 | /* All of the meaningful `numberof' fields will be properly set; for */ 246 | /* instance, `numberofedges' will represent the number of edges in the */ 247 | /* triangulation whether or not the edges were written. If segments are */ 248 | /* not used, `numberofsegments' will indicate the number of boundary edges. */ 249 | /* */ 250 | /*****************************************************************************/ 251 | 252 | struct triangulateio { 253 | float *pointlist; /* In / out */ 254 | float *pointattributelist; /* In / out */ 255 | int *pointmarkerlist; /* In / out */ 256 | int numberofpoints; /* In / out */ 257 | int numberofpointattributes; /* In / out */ 258 | 259 | int *trianglelist; /* In / out */ 260 | float *triangleattributelist; /* In / out */ 261 | float *trianglearealist; /* In only */ 262 | int *neighborlist; /* Out only */ 263 | int numberoftriangles; /* In / out */ 264 | int numberofcorners; /* In / out */ 265 | int numberoftriangleattributes; /* In / out */ 266 | 267 | int *segmentlist; /* In / out */ 268 | int *segmentmarkerlist; /* In / out */ 269 | int numberofsegments; /* In / out */ 270 | 271 | float *holelist; /* In / pointer to array copied out */ 272 | int numberofholes; /* In / copied out */ 273 | 274 | float *regionlist; /* In / pointer to array copied out */ 275 | int numberofregions; /* In / copied out */ 276 | 277 | int *edgelist; /* Out only */ 278 | int *edgemarkerlist; /* Not used with Voronoi diagram; out only */ 279 | float *normlist; /* Used only with Voronoi diagram; out only */ 280 | int numberofedges; /* Out only */ 281 | }; 282 | 283 | void triangulate(char *,triangulateio *,triangulateio *,triangulateio *); 284 | void trifree(int *memptr); 285 | 286 | 
-------------------------------------------------------------------------------- /GPU/RUN: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #SBATCH -p cpeg655 4 | #SBATCH -N 1 5 | #SBATCH --gres=gpu:1 6 | 7 | #SBATCH -J ngeneva 8 | #SBATCH -e my_job_%j.err 9 | #SBATCH -o my_job_%j.out 10 | 11 | srun main -------------------------------------------------------------------------------- /GPU/elas_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "elas_gpu.h" 2 | 3 | using namespace std; 4 | 5 | __device__ uint32_t getAddressOffsetImage_GPU (const int32_t& u,const int32_t& v,const int32_t& width) { 6 | return v*width+u; 7 | } 8 | 9 | __device__ uint32_t getAddressOffsetGrid_GPU (const int32_t& x,const int32_t& y,const int32_t& d,const int32_t& width,const int32_t& disp_num) { 10 | return (y*width+x)*disp_num+d; 11 | } 12 | 13 | /** 14 | * CUDA Kernel for computing the match for a single UV coordinate 15 | */ 16 | __global__ void findMatch_GPU (int32_t* u_vals, int32_t* v_vals, int32_t size_total, float* planes_a, float* planes_b, float* planes_c, 17 | int32_t* disparity_grid, int32_t *grid_dims, uint8_t* I1_desc, uint8_t* I2_desc, 18 | int32_t* P, int32_t plane_radius, int32_t width ,int32_t height, bool* valids, bool right_image, float* D) { 19 | 20 | // get image width and height 21 | const int32_t disp_num = grid_dims[0]-1; 22 | const int32_t window_size = 2; 23 | 24 | //TODO: Remove hard code and use param 25 | bool subsampling = false; 26 | bool match_texture = true; 27 | int32_t grid_size = 20; 28 | 29 | // Pixel id 30 | uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x; 31 | 32 | // Check that we are in range 33 | if(idx >= size_total) 34 | return; 35 | 36 | // Else get our values from memory 37 | uint32_t u = u_vals[idx]; 38 | uint32_t v = v_vals[idx]; 39 | float plane_a = planes_a[idx]; 40 | float plane_b = planes_b[idx]; 41 | float plane_c = 
planes_c[idx]; 42 | bool valid = valids[idx]; 43 | 44 | // address of disparity we want to compute 45 | uint32_t d_addr; 46 | if (subsampling) d_addr = getAddressOffsetImage_GPU(u/2,v/2,width/2); 47 | else d_addr = getAddressOffsetImage_GPU(u,v,width); 48 | 49 | // check if u is ok 50 | if (u=width-window_size) 51 | return; 52 | 53 | // compute line start address 54 | int32_t line_offset = 16*width*max(min(v,height-3),2); 55 | uint8_t *I1_line_addr,*I2_line_addr; 56 | if (!right_image) { 57 | I1_line_addr = I1_desc+line_offset; 58 | I2_line_addr = I2_desc+line_offset; 59 | } else { 60 | I1_line_addr = I2_desc+line_offset; 61 | I2_line_addr = I1_desc+line_offset; 62 | } 63 | 64 | // compute I1 block start address 65 | uint8_t* I1_block_addr = I1_line_addr+16*u; 66 | 67 | // does this patch have enough texture? 68 | int32_t sum = 0; 69 | for (int32_t i=0; i<16; i++) 70 | sum += abs((int32_t)(*(I1_block_addr+i))-128); 71 | if (sumd_plane_max) { //If the current disparity is out of the planes range 95 | u_warp = u-d_curr+2*right_image*d_curr; //uwarp diffe 96 | if (u_warp=width-window_size) 97 | continue; 98 | u_warp = 16*u_warp; 99 | val = 0; 100 | for(int j=0; j<16; j++){ 101 | //val += abs((int32_t)(*(I1_block_addr+j))-(int32_t)(*(I2_line_addr+j+16*u_warp))); 102 | val = __sad((int)(*(I1_block_addr+j)),(int)(*(I2_line_addr+j+u_warp)),val); 103 | } 104 | 105 | if (val=width-window_size) 115 | continue; 116 | u_warp = 16*u_warp; 117 | val = 0; 118 | for(int j=0; j<16; j++){ 119 | //val += abs((int32_t)(*(I1_block_addr+j))-(int32_t)(*(I2_line_addr+j+16*u_warp))); 120 | val = __sad((int)(*(I1_block_addr+j)),(int)(*(I2_line_addr+j+u_warp)),val); 121 | } 122 | val += valid?*(P+abs(d_curr-d_plane)):0; 123 | if (val=0) *(D+d_addr) = min_d; // MAP value (min neg-Log probability) 131 | else *(D+d_addr) = -1; // invalid disparity 132 | } 133 | 134 | // implements approximation to 8x8 bilateral filtering 135 | __global__ void adaptiveMeanGPU8 (float* D, int32_t D_width, int32_t 
D_height) { 136 | 137 | // Global coordinates and Pixel id 138 | uint32_t u0 = blockDim.x*blockIdx.x + threadIdx.x + 4; 139 | uint32_t v0 = blockDim.y*blockIdx.y + threadIdx.y + 4; 140 | uint32_t idx = v0*D_width + u0; 141 | //Local thread coordinates 142 | uint32_t ut = threadIdx.x + 4; 143 | uint32_t vt = threadIdx.y + 4; 144 | 145 | //If out of filter range return instantly 146 | if(u0 > (D_width - 4) || v0 > (D_height - 4)) 147 | return; 148 | 149 | //Allocate Shared memory array with an appropiate margin for the bitlateral filter 150 | //Since we are using 8 pixels with the center pixel being 5, 151 | //we need 4 extra on left and top and 3 extra on right and bottom 152 | __shared__ float D_shared[32+7][32+7]; 153 | //Populate shared memory 154 | if(threadIdx.x == blockDim.x-1){ 155 | D_shared[ut+1][vt] = D[idx+1]; 156 | D_shared[ut+2][vt] = D[idx+2]; 157 | D_shared[ut+3][vt] = D[idx+3]; 158 | //D_shared[ut+4][vt] = D[idx+4]; 159 | } 160 | if(threadIdx.x == 0){ 161 | D_shared[ut-4][vt] = D[idx-4]; 162 | D_shared[ut-3][vt] = D[idx-3]; 163 | D_shared[ut-2][vt] = D[idx-2]; 164 | D_shared[ut-1][vt] = D[idx-1]; 165 | } 166 | if(threadIdx.y == 0){ 167 | D_shared[ut][vt-4] = D[(v0-4)*D_width+u0]; 168 | D_shared[ut][vt-3] = D[(v0-3)*D_width+u0]; 169 | D_shared[ut][vt-2] = D[(v0-2)*D_width+u0]; 170 | D_shared[ut][vt-1] = D[(v0-1)*D_width+u0]; 171 | } 172 | if(threadIdx.y == blockDim.y-1){ 173 | D_shared[ut][vt+1] = D[(v0+1)*D_width+u0]; 174 | D_shared[ut][vt+2] = D[(v0+2)*D_width+u0]; 175 | D_shared[ut][vt+3] = D[(v0+3)*D_width+u0]; 176 | //D_shared[ut][vt+4] = D[(v0+4)*D_width+u0]; 177 | } 178 | 179 | if(D[idx] < 0){ 180 | // zero input disparity maps to -10 (this makes the bilateral 181 | // weights of all valid disparities to 0 in this region) 182 | D_shared[ut][vt] = -10; 183 | }else{ 184 | D_shared[ut][vt] = D[idx]; 185 | } 186 | __syncthreads(); 187 | 188 | // full resolution: 8 pixel bilateral filter width 189 | // D(x) = sum(I(xi)*f(I(xi)-I(x))*g(xi-x))/W(x) 
190 | // W(x) = sum(f(I(xi)-I(x))*g(xi-x)) 191 | // g(xi-x) = 1 192 | // f(I(xi)-I(x)) = 4-|I(xi)-I(x)| if greater than 0, 0 otherwise 193 | // horizontal filter 194 | 195 | // Current pixel being filtered is middle of our set (4 back, in orginal its 3 for some reason) 196 | //Note this isn't truely the center since original uses 8 vectore resisters 197 | float val_curr = D_shared[ut][vt]; 198 | 199 | float weight_sum0 = 0; 200 | float weight_sum = 0; 201 | float factor_sum = 0; 202 | 203 | for(int32_t i=0; i < 8; i++){ 204 | weight_sum0 = 4.0f - fabs(D_shared[ut+(i-4)][vt]-val_curr); 205 | weight_sum0 = max(0.0f, weight_sum0); 206 | weight_sum += weight_sum0; 207 | factor_sum += D_shared[ut+(i-4)][vt]*weight_sum0; 208 | } 209 | 210 | if (weight_sum>0) { 211 | float d = factor_sum/weight_sum; 212 | if (d>=0) *(D+idx) = d; 213 | } 214 | 215 | __syncthreads(); 216 | //Update shared memory 217 | if(threadIdx.x == blockDim.x-1){ 218 | D_shared[ut+1][vt] = D[idx+1]; 219 | D_shared[ut+2][vt] = D[idx+2]; 220 | D_shared[ut+3][vt] = D[idx+3]; 221 | //D_shared[ut+4][vt] = D[idx+4]; 222 | } 223 | if(threadIdx.x == 0){ 224 | D_shared[ut-4][vt] = D[idx-4]; 225 | D_shared[ut-3][vt] = D[idx-3]; 226 | D_shared[ut-2][vt] = D[idx-2]; 227 | D_shared[ut-1][vt] = D[idx-1]; 228 | } 229 | if(threadIdx.y == 0){ 230 | D_shared[ut][vt-4] = D[(v0-4)*D_width+u0]; 231 | D_shared[ut][vt-3] = D[(v0-3)*D_width+u0]; 232 | D_shared[ut][vt-2] = D[(v0-2)*D_width+u0]; 233 | D_shared[ut][vt-1] = D[(v0-1)*D_width+u0]; 234 | } 235 | if(threadIdx.y == blockDim.y-1){ 236 | D_shared[ut][vt+1] = D[(v0+1)*D_width+u0]; 237 | D_shared[ut][vt+2] = D[(v0+2)*D_width+u0]; 238 | D_shared[ut][vt+3] = D[(v0+3)*D_width+u0]; 239 | //D_shared[ut][vt+4] = D[(v0+4)*D_width+u0]; 240 | } 241 | 242 | if(D[idx] < 0){ 243 | D_shared[ut][vt] = -10; 244 | }else{ 245 | D_shared[ut][vt] = D[idx]; 246 | } 247 | 248 | __syncthreads(); 249 | 250 | // vertical filter 251 | // set pixel of interest 252 | val_curr = D_shared[ut][vt]; 253 
| 254 | weight_sum0 = 0; 255 | weight_sum = 0; 256 | factor_sum = 0; 257 | 258 | for(int32_t i=0; i < 8; i++){ 259 | weight_sum0 = 4.0f - fabs(D_shared[ut][vt+(i-4)]-val_curr); 260 | weight_sum0 = max(0.0f, weight_sum0); 261 | weight_sum += weight_sum0; 262 | factor_sum += D_shared[ut][vt+(i-4)]*weight_sum0; 263 | } 264 | 265 | if (weight_sum>0) { 266 | float d = factor_sum/weight_sum; 267 | if (d>=0) *(D+idx) = d; 268 | } 269 | 270 | } 271 | 272 | /** 273 | * This is the core method that computes the disparity of the image 274 | * It processes each triangle, so we create a kernel and have each thread 275 | * compute the matches in each triangle 276 | */ 277 | void ElasGPU::computeDisparity(std::vector p_support, std::vector tri, int32_t* disparity_grid, int32_t *grid_dims, 278 | uint8_t* I1_desc, uint8_t* I2_desc, bool right_image, float* D) { 279 | 280 | // number of disparities 281 | const int32_t disp_num = grid_dims[0]-1; 282 | 283 | // descriptor window_size 284 | int32_t window_size = 2; 285 | 286 | // init disparity image to -10 287 | if (param.subsampling) { 288 | for (int32_t i=0; i<(width/2)*(height/2); i++) 289 | *(D+i) = -10; 290 | } else { 291 | for (int32_t i=0; itri_u[j]) { 358 | float tri_u_temp = tri_u[j]; tri_u[j] = tri_u[k]; tri_u[k] = tri_u_temp; 359 | float tri_v_temp = tri_v[j]; tri_v[j] = tri_v[k]; tri_v[k] = tri_v_temp; 360 | } 361 | } 362 | } 363 | 364 | // rename corners 365 | float A_u = tri_u[0]; float A_v = tri_v[0]; 366 | float B_u = tri_u[1]; float B_v = tri_v[1]; 367 | float C_u = tri_u[2]; float C_v = tri_v[2]; 368 | 369 | // compute straight lines connecting triangle corners 370 | float AB_a = 0; float AC_a = 0; float BC_a = 0; 371 | if ((int32_t)(A_u)!=(int32_t)(B_u)) AB_a = (A_v-B_v)/(A_u-B_u); 372 | if ((int32_t)(A_u)!=(int32_t)(C_u)) AC_a = (A_v-C_v)/(A_u-C_u); 373 | if ((int32_t)(B_u)!=(int32_t)(C_u)) BC_a = (B_v-C_v)/(B_u-C_u); 374 | float AB_b = A_v-AB_a*A_u; 375 | float AC_b = A_v-AC_a*A_u; 376 | float BC_b = B_v-BC_a*B_u; 
377 | 378 | // a plane is only valid if itself and its projection 379 | // into the other image is not too much slanted 380 | bool valid = fabs(plane_a)<0.7 && fabs(plane_d)<0.7; 381 | 382 | // Vector of all u,v pairs we need to calculate 383 | std::vector temp_val_u = std::vector(); 384 | std::vector temp_val_v = std::vector(); 385 | 386 | // first part (triangle corner A->B) 387 | if ((int32_t)(A_u)!=(int32_t)(B_u)) { 388 | // Starting at A_u loop till the B_u or the end of the image 389 | for (int32_t u=max((int32_t)A_u,0); uC) 407 | if ((int32_t)(B_u)!=(int32_t)(C_u)) { 408 | for (int32_t u=max((int32_t)B_u,0); u>>(d_u_vals, d_v_vals, size_total, d_planes_a, d_planes_b, d_planes_c, 508 | d_disparity_grid, d_grid_dims, d_I1, d_I2, d_P, plane_radius, 509 | width, height, d_valids, right_image, d_D); 510 | 511 | // Sync after the kernel is launched 512 | cudaDeviceSynchronize(); 513 | 514 | // Copy the final disparity values back over 515 | cudaMemcpy(D, d_D, width*height*sizeof(float), cudaMemcpyDeviceToHost); 516 | 517 | // Free local memory 518 | delete[] P; 519 | 520 | // Delete host code 521 | delete planes_a; 522 | delete planes_b; 523 | delete planes_c; 524 | delete pixs_u; 525 | delete pixs_v; 526 | delete valids; 527 | 528 | // Free big memory 529 | cudaFree(d_u_vals); 530 | cudaFree(d_v_vals); 531 | cudaFree(d_planes_a); 532 | cudaFree(d_planes_b); 533 | cudaFree(d_planes_c); 534 | 535 | // Free cuda memory 536 | cudaFree(d_disparity_grid); 537 | cudaFree(d_P); 538 | cudaFree(d_D); 539 | cudaFree(d_I1); 540 | cudaFree(d_I2); 541 | cudaFree(d_grid_dims); 542 | cudaFree(d_u_vals); 543 | cudaFree(d_v_vals); 544 | 545 | } 546 | 547 | // implements approximation to bilateral filtering 548 | void ElasGPU::adaptiveMean (float* D) { 549 | 550 | // get disparity image dimensions 551 | int32_t D_width = width; 552 | int32_t D_height = height; 553 | if (param.subsampling) { 554 | D_width = width/2; 555 | D_height = height/2; 556 | } 557 | 558 | // allocate 
temporary memory 559 | float* D_copy = (float*)malloc(D_width*D_height*sizeof(float)); 560 | float* D_tmp = (float*)malloc(D_width*D_height*sizeof(float)); 561 | memcpy(D_copy,D,D_width*D_height*sizeof(float)); 562 | 563 | // zero input disparity maps to -10 (this makes the bilateral 564 | // weights of all valid disparities to 0 in this region) 565 | for (int32_t i=0; i0) { 614 | float d = factor_sum/weight_sum; 615 | if (d>=0) *(D_tmp+v*D_width+(u-1)) = d; 616 | } 617 | } 618 | } 619 | 620 | // vertical filter 621 | for (int32_t u=3; u0) { 648 | float d = factor_sum/weight_sum; 649 | if (d>=0) *(D+(v-1)*D_width+u) = d; 650 | } 651 | } 652 | } 653 | 654 | // full resolution: 8 pixel bilateral filter width 655 | // D(x) = sum(I(x)*f(I(xi)-I(x))*g(xi-x))/W(x) 656 | // W(x) = sum(f(I(xi)-I(x))*g(xi-x)) 657 | // g(xi-x) = 1 658 | // f(I(xi)-I(x)) = 1-(I(xi)-I(x)) if greater than 0, 0 otherwise 659 | } else { 660 | 661 | // Calculate size of kernel 662 | int block_width = 8; 663 | int block_height = block_width; 664 | int grid_width, grid_height; 665 | 666 | //Calculate grid_size 667 | if((width-8)%block_width == 0) { 668 | grid_width = ceil(width/block_width); 669 | } else { 670 | grid_width = ceil(width/block_width); + 1; 671 | } 672 | 673 | if((height-8)%block_height == 0) { 674 | grid_height = ceil(height/block_height); 675 | } else { 676 | grid_height = ceil(height/block_height); + 1; 677 | } 678 | 679 | // Create size objects 680 | dim3 DimGrid(grid_width,grid_height,1); 681 | dim3 DimBlock(block_width,block_height,1); 682 | 683 | // CUDA copy over needed memory information 684 | // disparity_grid and respective copies 685 | float* d_D; 686 | 687 | // Allocate on global memory and copy 688 | cudaMalloc((void**) &d_D, width*height*sizeof(float)); 689 | cudaMemcpy(d_D, D, width*height*sizeof(float), cudaMemcpyHostToDevice); 690 | 691 | //Kernel go! 
692 | adaptiveMeanGPU8<<>>(d_D, width, height); 693 | 694 | // Sync after the kernel is launched 695 | cudaDeviceSynchronize(); 696 | 697 | // Copy the final disparity values back over 698 | cudaMemcpy(D, d_D, width*height*sizeof(float), cudaMemcpyDeviceToHost); 699 | 700 | //Free memory 701 | cudaFree(d_D); 702 | 703 | 704 | // horizontal filter 705 | /*for (int32_t v=3; v0) { 732 | float d = factor_sum2/weight_sum2; 733 | if (d>=0) *(D_tmp+v*D_width+(u-3)) = d; 734 | } 735 | } 736 | } 737 | 738 | // vertical filter 739 | for (int32_t u=3; u0) { 764 | float d = factor_sum2/weight_sum2; 765 | if (d>=0) *(D+(v-3)*D_width+u) = d; 766 | } 767 | } 768 | }*/ 769 | } 770 | 771 | // free memory 772 | _mm_free(val); 773 | _mm_free(weight); 774 | _mm_free(factor); 775 | free(D_copy); 776 | free(D_tmp); 777 | } -------------------------------------------------------------------------------- /GPU/elas_gpu.h: -------------------------------------------------------------------------------- 1 | #ifndef __ELAS_GPU_H__ 2 | #define __ELAS_GPU_H__ 3 | 4 | // Enable profiling 5 | #define PROFILE 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "elas.h" 15 | #include "descriptor.h" 16 | #include "triangle.h" 17 | #include "matrix.h" 18 | 19 | 20 | /** 21 | * Our ElasGPU class with all cuda implementations 22 | * Note where we extend the Elas class so we are calling 23 | * On all non-gpu functions there if they are not implemented 24 | */ 25 | class ElasGPU : public Elas { 26 | 27 | public: 28 | 29 | // Constructor, input: parameters 30 | // Pass this to the super constructor 31 | ElasGPU(parameters param) : Elas(param) {} 32 | 33 | // This was originally "private" 34 | // Was converted to allow sub-classes to call this 35 | // This assumes the user knows what they are doing 36 | public: 37 | 38 | void computeDisparity(std::vector p_support,std::vector tri,int32_t* disparity_grid,int32_t *grid_dims, 39 | uint8_t* I1_desc,uint8_t* 
I2_desc,bool right_image,float* D); 40 | 41 | void adaptiveMean (float* D); 42 | 43 | }; 44 | 45 | 46 | #endif //__ELAS_GPU_H__ 47 | -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/aloe_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/aloe_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/aloe_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/aloe_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/cones_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/cones_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/cones_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/cones_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/raindeer_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/raindeer_left_disp.pgm -------------------------------------------------------------------------------- 
/GPU_test/2016_12_06_cpu/raindeer_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/raindeer_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban1_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban1_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban1_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban1_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban2_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban2_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban2_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban2_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban3_left_disp.pgm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban3_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban3_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban3_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban4_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban4_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_cpu/urban4_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban4_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/aloe_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/aloe_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/aloe_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/aloe_right_disp.pgm -------------------------------------------------------------------------------- 
/GPU_test/2016_12_06_gpu/cones_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/cones_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/cones_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/cones_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/raindeer_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/raindeer_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/raindeer_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/raindeer_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban1_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban1_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban1_right_disp.pgm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban1_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban2_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban2_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban2_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban2_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban3_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban3_left_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban3_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban3_right_disp.pgm -------------------------------------------------------------------------------- /GPU_test/2016_12_06_gpu/urban4_left_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban4_left_disp.pgm -------------------------------------------------------------------------------- 
/GPU_test/2016_12_06_gpu/urban4_right_disp.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban4_right_disp.pgm -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Nicholas Geneva 4 | Copyright (c) 2016 Patrick Geneva 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
-------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # libelas-gpu 2 | 3 | This is an implementation of the well-known [libelas](http://www.cvlibs.net/software/libelas/) 4 | (LIBrary for Efficient LArge-scale Stereo matching) library for 5 | sparse large real-time calculation of stereo disparity images. 6 | This is for a college course final project and thus does not have 7 | support of any kind. The original source code can be found 8 | in the CPU directory. It has been commented as we explored the code. 9 | 10 | 11 | From there, the GPU CUDA implementation of key methods can be found 12 | in the GPU folder. For methods not implemented on the GPU the CPU 13 | version is called, and such code is run on the CPU. 14 | 15 | ## Differences 16 | 17 | The key difference is the changing of most of the methods in the ELAS class 18 | to being virtual. This allows for the sub-classes to override such methods. 19 | This allows for seamless transition between the GPU and CPU code. Additionally, 20 | the methods were converted to public to allow for the testing and comparison of the two methods directly. 21 | 22 | ## Building 23 | 24 | * First create a build directory `mkdir build` 25 | * Move into this directory `cd build` 26 | * Run the C++ [cmake](https://cmake.org/) build system `cmake ..` 27 | * Finally build all the executables `make -j5` 28 | * To build a single one perform the following `make -j5 <target>` 29 | * To run the program do the following 30 | - `./libelas_cpu demo` 31 | - `./libelas_gpu` 32 | 33 | 34 | ## Licenses 35 | 36 | All the original code is licensed under the **GNU General Public License**. This can be found with the original [libelas](http://www.cvlibs.net/software/libelas/) library. All other code is licensed under the MIT license, which is attached to this repo's LICENSE file. 
37 | -------------------------------------------------------------------------------- /input/aloe_left.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/aloe_left.pgm -------------------------------------------------------------------------------- /input/aloe_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/aloe_right.pgm -------------------------------------------------------------------------------- /input/cones_left.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/cones_left.pgm -------------------------------------------------------------------------------- /input/cones_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/cones_right.pgm -------------------------------------------------------------------------------- /input/raindeer_left.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/raindeer_left.pgm -------------------------------------------------------------------------------- /input/raindeer_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/raindeer_right.pgm -------------------------------------------------------------------------------- /input/urban1_left.pgm: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban1_left.pgm -------------------------------------------------------------------------------- /input/urban1_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban1_right.pgm -------------------------------------------------------------------------------- /input/urban2_left.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban2_left.pgm -------------------------------------------------------------------------------- /input/urban2_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban2_right.pgm -------------------------------------------------------------------------------- /input/urban3_left.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban3_left.pgm -------------------------------------------------------------------------------- /input/urban3_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban3_right.pgm -------------------------------------------------------------------------------- /input/urban4_left.pgm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban4_left.pgm -------------------------------------------------------------------------------- /input/urban4_right.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban4_right.pgm -------------------------------------------------------------------------------- /main_cpu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2011. All rights reserved. 3 | Institute of Measurement and Control Systems 4 | Karlsruhe Institute of Technology, Germany 5 | 6 | This file is part of libelas. 7 | Authors: Andreas Geiger 8 | 9 | libelas is free software; you can redistribute it and/or modify it under the 10 | terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 3 of the License, or any later version. 12 | 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License along with 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 20 | */ 21 | 22 | // Demo program showing how libelas can be used, try "./elas -h" for help 23 | #include 24 | #include "elas.h" 25 | #include "image.h" 26 | 27 | using namespace std; 28 | 29 | 30 | // compute disparities of pgm image input pair file_1, file_2 31 | void process (const char* file_1,const char* file_2) { 32 | 33 | cout << "Processing: " << file_1 << ", " << file_2 << endl; 34 | 35 | // load images 36 | image *I1,*I2; 37 | I1 = loadPGM(file_1); 38 | I2 = loadPGM(file_2); 39 | 40 | // check for correct size 41 | if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 || 42 | I1->width()!=I2->width() || I1->height()!=I2->height()) { 43 | cout << "ERROR: Images must be of same size, but" << endl; 44 | cout << " I1: " << I1->width() << " x " << I1->height() << 45 | ", I2: " << I2->width() << " x " << I2->height() << endl; 46 | delete I1; 47 | delete I2; 48 | return; 49 | } 50 | 51 | // get image width and height 52 | int32_t width = I1->width(); 53 | int32_t height = I1->height(); 54 | 55 | // allocate memory for disparity images 56 | const int32_t dims[3] = {width,height,width}; // bytes per line = width 57 | float* D1_data = (float*)malloc(width*height*sizeof(float)); 58 | float* D2_data = (float*)malloc(width*height*sizeof(float)); 59 | 60 | // process 61 | Elas::parameters param; 62 | param.postprocess_only_left = false; 63 | Elas elas(param); 64 | elas.process(I1->data,I2->data,D1_data,D2_data,dims); 65 | 66 | // find maximum disparity for scaling output disparity images to [0..255] 67 | float disp_max = 0; 68 | for (int32_t i=0; idisp_max) disp_max = D1_data[i]; 70 | if (D2_data[i]>disp_max) disp_max = D2_data[i]; 71 | } 72 | 73 | // copy float to uchar 74 | image *D1 = new image(width,height); 75 | image *D2 = 
new image(width,height); 76 | for (int32_t i=0; idata[i] = (uint8_t)max(255.0*D1_data[i]/disp_max,0.0); 78 | D2->data[i] = (uint8_t)max(255.0*D2_data[i]/disp_max,0.0); 79 | } 80 | 81 | // save disparity images 82 | char output_1[1024]; 83 | char output_2[1024]; 84 | strncpy(output_1,file_1,strlen(file_1)-4); 85 | strncpy(output_2,file_2,strlen(file_2)-4); 86 | output_1[strlen(file_1)-4] = '\0'; 87 | output_2[strlen(file_2)-4] = '\0'; 88 | strcat(output_1,"_disp.pgm"); 89 | strcat(output_2,"_disp.pgm"); 90 | savePGM(D1,output_1); 91 | savePGM(D2,output_2); 92 | 93 | // free memory 94 | delete I1; 95 | delete I2; 96 | delete D1; 97 | delete D2; 98 | free(D1_data); 99 | free(D2_data); 100 | } 101 | 102 | int main (int argc, char** argv) { 103 | 104 | // Run the demo 105 | // Assume we are running from sub-folder 106 | if (argc==2 && !strcmp(argv[1],"demo")) { 107 | process("../input/cones_left.pgm", "../input/cones_right.pgm"); 108 | process("../input/aloe_left.pgm", "../input/aloe_right.pgm"); 109 | process("../input/raindeer_left.pgm","../input/raindeer_right.pgm"); 110 | process("../input/urban1_left.pgm", "../input/urban1_right.pgm"); 111 | process("../input/urban2_left.pgm", "../input/urban2_right.pgm"); 112 | process("../input/urban3_left.pgm", "../input/urban3_right.pgm"); 113 | process("../input/urban4_left.pgm", "../input/urban4_right.pgm"); 114 | cout << "... done!" << endl; 115 | 116 | // compute disparity from input pair 117 | } else if (argc==3) { 118 | process(argv[1],argv[2]); 119 | cout << "... done!" << endl; 120 | 121 | // display help 122 | } else { 123 | cout << endl; 124 | cout << "ELAS demo program usage: " << endl; 125 | cout << "./elas demo ................ process all test images (image dir)" << endl; 126 | cout << "./elas left.pgm right.pgm .. process a single stereo pair" << endl; 127 | cout << "./elas -h .................. shows this help" << endl; 128 | cout << endl; 129 | cout << "Note: All images must be pgm greylevel images. 
All output" << endl; 130 | cout << " disparities will be scaled such that disp_max = 255." << endl; 131 | cout << endl; 132 | } 133 | 134 | return 0; 135 | } 136 | 137 | 138 | -------------------------------------------------------------------------------- /main_gpu.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include "elas.h" 4 | #include "elas_gpu.h" 5 | #include "image.h" 6 | 7 | using namespace std; 8 | 9 | 10 | // Global functions 11 | void process (const char* file_1,const char* file_2); 12 | 13 | // Enable profiling 14 | #define PROFILE 15 | 16 | int main(int argc, char** argv) { 17 | 18 | // Startup the GPU device 19 | // https://devtalk.nvidia.com/default/topic/895513/cuda-programming-and-performance/cudamalloc-slow/post/4724457/#4724457 20 | cudaFree(0); 21 | 22 | // Process example frames 23 | process("../input/cones_left.pgm", "../input/cones_right.pgm"); 24 | process("../input/aloe_left.pgm", "../input/aloe_right.pgm"); 25 | process("../input/raindeer_left.pgm","../input/raindeer_right.pgm"); 26 | process("../input/urban1_left.pgm", "../input/urban1_right.pgm"); 27 | process("../input/urban2_left.pgm", "../input/urban2_right.pgm"); 28 | process("../input/urban3_left.pgm", "../input/urban3_right.pgm"); 29 | process("../input/urban4_left.pgm", "../input/urban4_right.pgm"); 30 | cout << "... done!" << endl; 31 | 32 | // Done! 
33 | return EXIT_SUCCESS; 34 | } 35 | 36 | 37 | 38 | /** 39 | * Compute disparities of pgm image input pair file_1, file_2 40 | */ 41 | void process (const char* file_1,const char* file_2) { 42 | 43 | cout << "Processing: " << file_1 << ", " << file_2 << endl; 44 | 45 | // load images 46 | image *I1,*I2; 47 | I1 = loadPGM(file_1); 48 | I2 = loadPGM(file_2); 49 | 50 | // check for correct size 51 | if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 || 52 | I1->width()!=I2->width() || I1->height()!=I2->height()) { 53 | cout << "ERROR: Images must be of same size, but" << endl; 54 | cout << " I1: " << I1->width() << " x " << I1->height() << 55 | ", I2: " << I2->width() << " x " << I2->height() << endl; 56 | delete I1; 57 | delete I2; 58 | return; 59 | } 60 | 61 | // get image width and height 62 | int32_t width = I1->width(); 63 | int32_t height = I1->height(); 64 | 65 | // allocate memory for disparity images 66 | const int32_t dims[3] = {width,height,width}; // bytes per line = width 67 | float* D1_data = (float*)malloc(width*height*sizeof(float)); 68 | float* D2_data = (float*)malloc(width*height*sizeof(float)); 69 | 70 | // process 71 | Elas::parameters param; 72 | param.postprocess_only_left = false; 73 | //param.subsampling = true; 74 | ElasGPU elas(param); 75 | elas.process(I1->data,I2->data,D1_data,D2_data,dims); 76 | 77 | // find maximum disparity for scaling output disparity images to [0..255] 78 | float disp_max = 0; 79 | for (int32_t i=0; idisp_max) disp_max = D1_data[i]; 81 | if (D2_data[i]>disp_max) disp_max = D2_data[i]; 82 | } 83 | 84 | // copy float to uchar 85 | image *D1 = new image(width,height); 86 | image *D2 = new image(width,height); 87 | for (int32_t i=0; idata[i] = (uint8_t)max(255.0*D1_data[i]/disp_max,0.0); 89 | D2->data[i] = (uint8_t)max(255.0*D2_data[i]/disp_max,0.0); 90 | } 91 | 92 | // save disparity images 93 | char output_1[1024]; 94 | char output_2[1024]; 95 | strncpy(output_1,file_1,strlen(file_1)-4); 96 | 
strncpy(output_2,file_2,strlen(file_2)-4); 97 | output_1[strlen(file_1)-4] = '\0'; 98 | output_2[strlen(file_2)-4] = '\0'; 99 | strcat(output_1,"_disp.pgm"); 100 | strcat(output_2,"_disp.pgm"); 101 | savePGM(D1,output_1); 102 | savePGM(D2,output_2); 103 | 104 | // free memory 105 | delete I1; 106 | delete I2; 107 | delete D1; 108 | delete D2; 109 | free(D1_data); 110 | free(D2_data); 111 | } -------------------------------------------------------------------------------- /main_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "image.h" 4 | 5 | using namespace std; 6 | 7 | 8 | // compute disparities of pgm image input pair file_1, file_2 9 | void process (const char* file_1,const char* file_2) { 10 | 11 | cout << "Processing: " << file_1 << ", " << file_2 << endl; 12 | 13 | // load images 14 | image *I1,*I2; 15 | I1 = loadPGM(file_1); 16 | I2 = loadPGM(file_2); 17 | 18 | // check for correct size 19 | if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 || 20 | I1->width()!=I2->width() || I1->height()!=I2->height()) { 21 | cout << "ERROR: Images must be of same size, but" << endl; 22 | cout << " I1: " << I1->width() << " x " << I1->height() << 23 | ", I2: " << I2->width() << " x " << I2->height() << endl; 24 | delete I1; 25 | delete I2; 26 | return; 27 | } 28 | 29 | // Get image width and height 30 | int32_t width = I1->width(); 31 | int32_t height = I1->height(); 32 | 33 | // Allocate the calculation image matrix 34 | //image *I3(width, height, true); 35 | 36 | // Variable for the total error 37 | double sse = 0; 38 | 39 | // Compute the mean squared error between the two images 40 | // http://stackoverflow.com/a/17237076 41 | for(int32_t i=0; i " << endl; 88 | } 89 | 90 | return 0; 91 | } 92 | 93 | 94 | -------------------------------------------------------------------------------- /references/2010ACCV_Geiger.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/2010ACCV_Geiger.pdf -------------------------------------------------------------------------------- /references/2016IROS_Maddern.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/2016IROS_Maddern.pdf -------------------------------------------------------------------------------- /references/StereoNotes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/StereoNotes.pdf -------------------------------------------------------------------------------- /reports/2016_ELEG655_project_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_presentation.pdf -------------------------------------------------------------------------------- /reports/2016_ELEG655_project_propsal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_propsal.pdf -------------------------------------------------------------------------------- /reports/2016_ELEG655_project_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_report.pdf -------------------------------------------------------------------------------- /reports/project.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/project.pdf --------------------------------------------------------------------------------