├── .gitignore
├── CMakeLists.txt
├── CPU
    ├── descriptor.cpp
    ├── descriptor.h
    ├── elas.cpp
    ├── elas.h
    ├── filter.cpp
    ├── filter.h
    ├── image.h
    ├── matrix.cpp
    ├── matrix.h
    ├── timer.h
    ├── triangle.cpp
    └── triangle.h
├── GPU
    ├── RUN
    ├── elas_gpu.cu
    └── elas_gpu.h
├── GPU_test
    ├── 2016_12_06_cpu
    │   ├── aloe_left_disp.pgm
    │   ├── aloe_right_disp.pgm
    │   ├── cones_left_disp.pgm
    │   ├── cones_right_disp.pgm
    │   ├── raindeer_left_disp.pgm
    │   ├── raindeer_right_disp.pgm
    │   ├── urban1_left_disp.pgm
    │   ├── urban1_right_disp.pgm
    │   ├── urban2_left_disp.pgm
    │   ├── urban2_right_disp.pgm
    │   ├── urban3_left_disp.pgm
    │   ├── urban3_right_disp.pgm
    │   ├── urban4_left_disp.pgm
    │   └── urban4_right_disp.pgm
    └── 2016_12_06_gpu
    │   ├── aloe_left_disp.pgm
    │   ├── aloe_right_disp.pgm
    │   ├── cones_left_disp.pgm
    │   ├── cones_right_disp.pgm
    │   ├── raindeer_left_disp.pgm
    │   ├── raindeer_right_disp.pgm
    │   ├── urban1_left_disp.pgm
    │   ├── urban1_right_disp.pgm
    │   ├── urban2_left_disp.pgm
    │   ├── urban2_right_disp.pgm
    │   ├── urban3_left_disp.pgm
    │   ├── urban3_right_disp.pgm
    │   ├── urban4_left_disp.pgm
    │   └── urban4_right_disp.pgm
├── LICENSE
├── ReadMe.md
├── input
    ├── aloe_left.pgm
    ├── aloe_right.pgm
    ├── cones_left.pgm
    ├── cones_right.pgm
    ├── raindeer_left.pgm
    ├── raindeer_right.pgm
    ├── urban1_left.pgm
    ├── urban1_right.pgm
    ├── urban2_left.pgm
    ├── urban2_right.pgm
    ├── urban3_left.pgm
    ├── urban3_right.pgm
    ├── urban4_left.pgm
    └── urban4_right.pgm
├── main_cpu.cpp
├── main_gpu.cu
├── main_test.cpp
├── references
    ├── 2010ACCV_Geiger.pdf
    ├── 2016IROS_Maddern.pdf
    └── StereoNotes.pdf
└── reports
    ├── 2016_ELEG655_project_presentation.pdf
    ├── 2016_ELEG655_project_propsal.pdf
    ├── 2016_ELEG655_project_report.pdf
    └── project.pdf


/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | output/*
3 | !output/.gitkeep
4 | .vscode
5 | input/*_disp.pgm


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8.8)
 2 | 
 3 | # Project name
 4 | project(libelas-gpu)
 5 | 
 6 | # Include our cmake files
 7 | # SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/)
 8 | 
 9 | # Include libraries
10 | # CUDA is optional: when it is missing only the CPU and test binaries are configured
11 | find_package(CUDA)
12 | # find_package(Eigen3 REQUIRED)
13 | # find_package(Boost REQUIRED COMPONENTS system serialization system filesystem thread program_options date_time regex timer chrono)
14 | # find_package(OpenCV 3 REQUIRED core imgcodecs videoio ximgproc)
15 | 
16 | 
17 | # Try to compile with c++11
18 | # http://stackoverflow.com/a/25836953
19 | include(CheckCXXCompilerFlag)
20 | CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
21 | CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)
22 | if(COMPILER_SUPPORTS_CXX11)
23 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
24 | elseif(COMPILER_SUPPORTS_CXX0X)
25 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
26 | else()
27 |     message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
28 | endif()
29 | 
30 | # Enable compile optimizations
31 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
32 | 
33 | # Enable debug flags
34 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g3  -Wall")
35 | 
36 | # Enable SSE3 cpu commands
37 | # If we are on ARM we need to find an alternative
38 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3")
39 | 
40 | # Set our nvcc flags (only consumed by the FindCUDA macros below)
41 | # http://stackoverflow.com/a/13244930
42 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch sm_20 -gencode arch=compute_20,code=sm_20)
43 | 
44 | 
45 | # Include our header files
46 | include_directories(CPU GPU GPU_test)
47 | 
48 | # Include all source files in each sub-directory
49 | # file(GLOB_RECURSE sources_cpu CPU/*.cpp)
50 | # file(GLOB_RECURSE sources_gpu GPU/*.cu)
51 | # file(GLOB_RECURSE sources_cpu GPU_test/*.cpp)
52 | 
53 | 
54 | # Set the CPU files to build
55 | set(sources_cpu
56 |   CPU/descriptor.cpp
57 |   CPU/elas.cpp
58 |   CPU/filter.cpp
59 |   CPU/matrix.cpp
60 |   CPU/triangle.cpp
61 | )
62 | 
63 | # Set the GPU files to build
64 | set(sources_gpu
65 |   GPU/elas_gpu.cu
66 | )
67 | 
68 | 
69 | # CPU binary
70 | add_executable(libelas_cpu main_cpu.cpp ${sources_cpu})
71 | 
72 | # GPU binary: cuda_compile() and cuda_add_executable() are provided by the CUDA
73 | # package, so they may only be called when find_package(CUDA) succeeded.
74 | # The .cu files are compiled by nvcc first, then linked with the CPU sources.
75 | if(CUDA_FOUND)
76 |   cuda_compile(sources_gpu_built main_gpu.cu ${sources_gpu})
77 |   cuda_add_executable(libelas_gpu ${sources_gpu_built} ${sources_cpu})
78 | else()
79 |   message(STATUS "CUDA not found - skipping the libelas_gpu target")
80 | endif()
81 | 
82 | # Testing binary
83 | add_executable(libelas_test main_test.cpp ${sources_cpu})
84 | 


--------------------------------------------------------------------------------
/CPU/descriptor.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #include "descriptor.h"
 23 | #include "filter.h"
 24 | #include <emmintrin.h>
 25 | 
 26 | using namespace std;
 27 | 
 28 | Descriptor::Descriptor(uint8_t* I,int32_t width,int32_t height,int32_t bpl,bool half_resolution) { // builds I_desc from image I
 29 |   I_desc        = (uint8_t*)_mm_malloc(16*width*height*sizeof(uint8_t),16); // 16 bytes per pixel, 16-byte aligned; freed in the destructor
 30 |   uint8_t* I_du = (uint8_t*)_mm_malloc(bpl*height*sizeof(uint8_t),16); // temporary: first output image of sobel3x3
 31 |   uint8_t* I_dv = (uint8_t*)_mm_malloc(bpl*height*sizeof(uint8_t),16); // temporary: second output image of sobel3x3
 32 |   // Run the 3x3 Sobel filter on I; the two responses are written to I_du and I_dv
 33 |   filter::sobel3x3(I,I_du,I_dv,bpl,height);
 34 |   // Create a 16 byte descriptor for each visited image pixel from the filter responses
 35 |   createDescriptor(I_du,I_dv,width,height,bpl,half_resolution);
 36 |   _mm_free(I_du); // the filter responses are only needed while building I_desc
 37 |   _mm_free(I_dv);
 38 | } // NOTE(review): none of the _mm_malloc results is checked for NULL
 39 | 
 40 | Descriptor::~Descriptor() { // releases the descriptor buffer
 41 |   _mm_free(I_desc); // I_desc was obtained from _mm_malloc in the constructor
 42 | }
 43 | 
 44 | // Packs the filter responses into I_desc: one 16-byte descriptor for every
 45 | // visited pixel (u,v) with 3 <= u < width-3, made of 12 samples of I_du
 46 | // (horizontal filter result) followed by 4 samples of I_dv (vertical filter
 47 | // result). Sample positions relative to the pixel, rows v-2..v+2 and columns
 48 | // u-2..u+2 (x = sampled, 2 = sampled twice, o = centre, not sampled):
 49 | //   du:  - - x - -      dv:  - - - - -
 50 | //        x - x - x           - - x - -
 51 | //        - x 2 x -           - x o x -
 52 | //        x - x - x           - - x - -
 53 | //        - - x - -           - - - - -
 54 | // Descriptors of pixels that are not visited are left untouched.
 55 | void Descriptor::createDescriptor (uint8_t* I_du,uint8_t* I_dv,int32_t width,int32_t height,int32_t bpl,bool half_resolution) {
 56 | 
 57 |   // half resolution visits rows 4,6,8,... (every second line is skipped),
 58 |   // full resolution visits rows 3,4,5,...
 59 |   const int32_t first_row = half_resolution ? 4 : 3;
 60 |   const int32_t row_step  = half_resolution ? 2 : 1;
 61 | 
 62 |   for (int32_t v=first_row; v<height-3; v+=row_step) {
 63 | 
 64 |     // byte offsets of the rows v-2 .. v+2 inside the filtered images
 65 |     const uint32_t off_c  = v*bpl;       // current line
 66 |     const uint32_t off_m2 = off_c-2*bpl; // 2 lines above
 67 |     const uint32_t off_m1 = off_c-1*bpl; // 1 line above
 68 |     const uint32_t off_p1 = off_c+1*bpl; // 1 line below
 69 |     const uint32_t off_p2 = off_c+2*bpl; // 2 lines below
 70 | 
 71 |     // row pointers: I_du is sampled on all five rows, I_dv on the middle three
 72 |     const uint8_t* du_m2 = I_du+off_m2;
 73 |     const uint8_t* du_m1 = I_du+off_m1;
 74 |     const uint8_t* du_c  = I_du+off_c;
 75 |     const uint8_t* du_p1 = I_du+off_p1;
 76 |     const uint8_t* du_p2 = I_du+off_p2;
 77 |     const uint8_t* dv_m1 = I_dv+off_m1;
 78 |     const uint8_t* dv_c  = I_dv+off_c;
 79 |     const uint8_t* dv_p1 = I_dv+off_p1;
 80 | 
 81 |     for (int32_t u=3; u<width-3; u++) {
 82 |       uint8_t* desc = I_desc+(v*width+u)*16;
 83 |       desc[ 0] = du_m2[u];
 84 |       desc[ 1] = du_m1[u-2];
 85 |       desc[ 2] = du_m1[u];
 86 |       desc[ 3] = du_m1[u+2];
 87 |       desc[ 4] = du_c[u-1];
 88 |       desc[ 5] = du_c[u];   // the centre sample is stored twice
 89 |       desc[ 6] = du_c[u];
 90 |       desc[ 7] = du_c[u+1];
 91 |       desc[ 8] = du_p1[u-2];
 92 |       desc[ 9] = du_p1[u];
 93 |       desc[10] = du_p1[u+2];
 94 |       desc[11] = du_p2[u];
 95 |       desc[12] = dv_m1[u];
 96 |       desc[13] = dv_c[u-1];
 97 |       desc[14] = dv_c[u+1];
 98 |       desc[15] = dv_p1[u];
 99 |     }
100 |   }
101 | 
102 | }
103 | 


--------------------------------------------------------------------------------
/CPU/descriptor.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2011. All rights reserved.
 3 | Institute of Measurement and Control Systems
 4 | Karlsruhe Institute of Technology, Germany
 5 | 
 6 | This file is part of libelas.
 7 | Authors: Andreas Geiger
 8 | 
 9 | libelas is free software; you can redistribute it and/or modify it under the
10 | terms of the GNU General Public License as published by the Free Software
11 | Foundation; either version 3 of the License, or any later version.
12 | 
13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License along with
18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
20 | */
21 | 
22 | // NOTE: This descriptor is a sparse approximation to the 50-dimensional
23 | // descriptor described in the paper. It produces similar results, but
24 | // is faster to compute.
25 | 
26 | #ifndef __DESCRIPTOR_H__
27 | #define __DESCRIPTOR_H__
28 | 
29 | #include <iostream>
30 | #include <stdio.h>
31 | #include <string.h>
32 | #include <stdlib.h>
33 | #include <math.h>
34 | 
35 | // Define fixed-width datatypes for Visual Studio projects
36 | #ifndef _MSC_VER
37 |   #include <stdint.h>
38 | #else
39 |   typedef __int8            int8_t;
40 |   typedef __int16           int16_t;
41 |   typedef __int32           int32_t;
42 |   typedef __int64           int64_t;
43 |   typedef unsigned __int8   uint8_t;
44 |   typedef unsigned __int16  uint16_t;
45 |   typedef unsigned __int32  uint32_t;
46 |   typedef unsigned __int64  uint64_t;
47 | #endif
48 | 
49 | class Descriptor {
50 | public:
51 |   // constructor: allocates I_desc (16 bytes per pixel) and fills it from the filtered image I
52 |   Descriptor(uint8_t* I,int32_t width,int32_t height,int32_t bpl,bool half_resolution);
53 |   
54 |   // destructor releases I_desc
55 |   ~Descriptor();
56 |   
57 |   // descriptors accessible from outside; the buffer is owned by this object
58 |   uint8_t* I_desc;
59 |   
60 | private:
61 |   // not copyable: a copy would share I_desc and free it a second time
62 |   // (declared but intentionally never defined)
63 |   Descriptor(const Descriptor&);
64 |   Descriptor& operator=(const Descriptor&);
65 | 
66 |   // build descriptor I_desc from I_du and I_dv
67 |   void createDescriptor(uint8_t* I_du,uint8_t* I_dv,int32_t width,int32_t height,int32_t bpl,bool half_resolution);
68 | };
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/CPU/elas.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | // Main header file. Include this to use libelas in your code.
 23 | 
 24 | #ifndef __ELAS_H__
 25 | #define __ELAS_H__
 26 | 
 27 | // Enable profiling
 28 | #define PROFILE
 29 | 
 30 | #include <iostream>
 31 | #include <stdio.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | #include <vector>
 35 | #include <emmintrin.h>
 36 | 
 37 | // define fixed-width datatypes for Visual Studio projects
 38 | #ifndef _MSC_VER
 39 |   #include <stdint.h>
 40 | #else
 41 |   typedef __int8            int8_t;
 42 |   typedef __int16           int16_t;
 43 |   typedef __int32           int32_t;
 44 |   typedef __int64           int64_t;
 45 |   typedef unsigned __int8   uint8_t;
 46 |   typedef unsigned __int16  uint16_t;
 47 |   typedef unsigned __int32  uint32_t;
 48 |   typedef unsigned __int64  uint64_t;
 49 | #endif
 50 | 
 51 | #ifdef PROFILE
 52 | #include "timer.h"
 53 | #endif
 54 | 
 55 | class Elas {
 56 |   // Stereo matcher: process() takes a left/right image pair and fills the disparity images D1/D2.
 57 | public:
 58 |   
 59 |   enum setting {ROBOTICS,MIDDLEBURY};
 60 |   
 61 |   // parameter settings
 62 |   struct parameters {
 63 |     int32_t disp_min;               // min disparity
 64 |     int32_t disp_max;               // max disparity
 65 |     float   support_threshold;      // max. uniqueness ratio (best vs. second best support match)
 66 |     int32_t support_texture;        // min texture for support points
 67 |     int32_t candidate_stepsize;     // step size of regular grid on which support points are matched
 68 |     int32_t incon_window_size;      // window size of inconsistent support point check
 69 |     int32_t incon_threshold;        // disparity similarity threshold for support point to be considered consistent
 70 |     int32_t incon_min_support;      // minimum number of consistent support points
 71 |     bool    add_corners;            // add support points at image corners with nearest neighbor disparities
 72 |     int32_t grid_size;              // size of neighborhood for additional support point extrapolation
 73 |     float   beta;                   // image likelihood parameter
 74 |     float   gamma;                  // prior constant
 75 |     float   sigma;                  // prior sigma
 76 |     float   sradius;                // prior sigma radius
 77 |     int32_t match_texture;          // min texture for dense matching
 78 |     int32_t lr_threshold;           // disparity threshold for left/right consistency check
 79 |     float   speckle_sim_threshold;  // similarity threshold for speckle segmentation
 80 |     int32_t speckle_size;           // maximal size of a speckle (small speckles get removed)
 81 |     int32_t ipol_gap_width;         // interpolate small gaps (left<->right, top<->bottom)
 82 |     bool    filter_median;          // optional median filter (approximated)
 83 |     bool    filter_adaptive_mean;   // optional adaptive mean filter (approximated)
 84 |     bool    postprocess_only_left;  // saves time by not postprocessing the right image
 85 |     bool    subsampling;            // saves time by only computing disparities for each 2nd pixel
 86 |                                     // note: for this option D1 and D2 must be passed with size
 87 |                                     //       width/2 x height/2 (rounded towards zero)
 88 |     
 89 |     // constructor: fills every field with the defaults of the chosen setting
 90 |     parameters (setting s=ROBOTICS) {
 91 |       
 92 |       // default settings in a robotics environment
 93 |       // (do not produce results in half-occluded areas
 94 |       //  and are a bit more robust towards lighting etc.)
 95 |       if (s==ROBOTICS) {
 96 |         disp_min              = 0;
 97 |         disp_max              = 255;
 98 |         support_threshold     = 0.85;
 99 |         support_texture       = 10;
100 |         candidate_stepsize    = 5;
101 |         incon_window_size     = 5;
102 |         incon_threshold       = 5;
103 |         incon_min_support     = 5;
104 |         add_corners           = 0;
105 |         grid_size             = 20;
106 |         beta                  = 0.02;
107 |         gamma                 = 3;
108 |         sigma                 = 1;
109 |         sradius               = 2;
110 |         match_texture         = 1;
111 |         lr_threshold          = 2;
112 |         speckle_sim_threshold = 1;
113 |         speckle_size          = 200;
114 |         ipol_gap_width        = 3;
115 |         filter_median         = 0;
116 |         filter_adaptive_mean  = 1;
117 |         postprocess_only_left = 1;
118 |         subsampling           = 0;
119 |         
120 |       // default settings for middlebury benchmark
121 |       // (interpolate all missing disparities)
122 |       } else {
123 |         disp_min              = 0;
124 |         disp_max              = 255;
125 |         support_threshold     = 0.95;
126 |         support_texture       = 10;
127 |         candidate_stepsize    = 5;
128 |         incon_window_size     = 5;
129 |         incon_threshold       = 5;
130 |         incon_min_support     = 5;
131 |         add_corners           = 1;
132 |         grid_size             = 20;
133 |         beta                  = 0.02;
134 |         gamma                 = 5;
135 |         sigma                 = 1;
136 |         sradius               = 3;
137 |         match_texture         = 0;
138 |         lr_threshold          = 2;
139 |         speckle_sim_threshold = 1;
140 |         speckle_size          = 200;
141 |         ipol_gap_width        = 5000;
142 |         filter_median         = 1;
143 |         filter_adaptive_mean  = 0;
144 |         postprocess_only_left = 0;
145 |         subsampling           = 0;
146 |       }
147 |     }
148 |   };
149 | 
150 |   // constructor, input: parameters (stored by value in param)
151 |   Elas (parameters param) : param(param) {}
152 | 
153 |   // destructor; virtual so that sub-classes are destroyed correctly through an Elas pointer
154 |   virtual ~Elas () {}
155 |   
156 |   // matching function
157 |   // inputs: pointers to left (I1) and right (I2) intensity image (uint8, input)
158 |   //         pointers to left (D1) and right (D2) disparity image (float, output)
159 |   //         dims[0] = width of I1 and I2
160 |   //         dims[1] = height of I1 and I2
161 |   //         dims[2] = bytes per line (often equal to width, but allowed to differ)
162 |   //         note: D1 and D2 must be allocated before (bytes per line = width)
163 |   //               if subsampling is not active their size is width x height,
164 |   //               otherwise width/2 x height/2 (rounded towards zero)
165 |   void process (uint8_t* I1,uint8_t* I2,float* D1,float* D2,const int32_t* dims);
166 |   
167 | // This was originally "private"
168 | // Was converted to allow sub-classes to call this
169 | // This assumes the user knows what they are doing
170 | public:
171 |   
172 |   struct support_pt {
173 |     int32_t u;
174 |     int32_t v;
175 |     int32_t d;
176 |     support_pt(int32_t u,int32_t v,int32_t d):u(u),v(v),d(d){}
177 |   };
178 | 
179 |   struct triangle {
180 |     int32_t c1,c2,c3;
181 |     float   t1a,t1b,t1c;
182 |     float   t2a,t2b,t2c;
183 |     triangle(int32_t c1,int32_t c2,int32_t c3):c1(c1),c2(c2),c3(c3){} // t1*/t2* are left unset here
184 |   };
185 | 
186 |   inline uint32_t getAddressOffsetImage (const int32_t& u,const int32_t& v,const int32_t& width) {
187 |     return v*width+u; // row-major element index of (u,v) in an image that is width elements wide
188 |   }
189 | 
190 |   inline uint32_t getAddressOffsetGrid (const int32_t& x,const int32_t& y,const int32_t& d,const int32_t& width,const int32_t& disp_num) {
191 |     return (y*width+x)*disp_num+d; // disp_num consecutive entries per grid cell (x,y)
192 |   }
193 | 
194 |   // support point functions
195 |   virtual void removeInconsistentSupportPoints (int16_t* D_can,int32_t D_can_width,int32_t D_can_height);
196 |   virtual void removeRedundantSupportPoints (int16_t* D_can,int32_t D_can_width,int32_t D_can_height,
197 |                                      int32_t redun_max_dist, int32_t redun_threshold, bool vertical);
198 |   virtual void addCornerSupportPoints (std::vector<support_pt> &p_support);
199 |   inline int16_t computeMatchingDisparity (const int32_t &u,const int32_t &v,uint8_t* I1_desc,uint8_t* I2_desc,const bool &right_image);
200 |   virtual std::vector<support_pt> computeSupportMatches (uint8_t* I1_desc,uint8_t* I2_desc);
201 | 
202 |   // triangulation & grid
203 |   virtual std::vector<triangle> computeDelaunayTriangulation (std::vector<support_pt> p_support,int32_t right_image);
204 |   virtual void computeDisparityPlanes (std::vector<support_pt> p_support,std::vector<triangle> &tri,int32_t right_image);
205 |   virtual void createGrid (std::vector<support_pt> p_support,int32_t* disparity_grid,int32_t* grid_dims,bool right_image);
206 | 
207 |   // matching
208 |   inline void updatePosteriorMinimum (__m128i* I2_block_addr,const int32_t &d,const int32_t &w,
209 |                                       const __m128i &xmm1,__m128i &xmm2,int32_t &val,int32_t &min_val,int32_t &min_d);
210 |   inline void updatePosteriorMinimum (__m128i* I2_block_addr,const int32_t &d,
211 |                                       const __m128i &xmm1,__m128i &xmm2,int32_t &val,int32_t &min_val,int32_t &min_d);
212 |   inline void findMatch (int32_t &u,int32_t &v,float &plane_a,float &plane_b,float &plane_c,
213 |                          int32_t* disparity_grid,int32_t *grid_dims,uint8_t* I1_desc,uint8_t* I2_desc,
214 |                          int32_t *P,int32_t &plane_radius,bool &valid,bool &right_image,float* D);
215 |   virtual void computeDisparity (std::vector<support_pt> p_support,std::vector<triangle> tri,int32_t* disparity_grid,int32_t* grid_dims,
216 |                          uint8_t* I1_desc,uint8_t* I2_desc,bool right_image,float* D);
217 | 
218 |   // L/R consistency check
219 |   virtual void leftRightConsistencyCheck (float* D1,float* D2);
220 |   
221 |   // postprocessing
222 |   virtual void removeSmallSegments (float* D);
223 |   virtual void gapInterpolation (float* D);
224 | 
225 |   // optional postprocessing
226 |   virtual void adaptiveMean (float* D);
227 |   virtual void median (float* D);
228 |   
229 |   // parameter set
230 |   parameters param;
231 |   
232 |   // memory aligned input images + dimensions (not initialised by the constructor)
233 |   uint8_t *I1,*I2;
234 |   int32_t width,height,bpl;
235 |   
236 |   // profiling timer
237 | #ifdef PROFILE
238 |   Timer timer;
239 | #endif
240 | };
241 | 
242 | #endif
243 | 


--------------------------------------------------------------------------------
/CPU/filter.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Julius Ziegler, Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #include <stdio.h>
 23 | #include <string.h>
 24 | #include <cassert>
 25 | 
 26 | #include "filter.h"
 27 | 
 28 | // define fixed-width datatypes for Visual Studio projects
 29 | #ifndef _MSC_VER
 30 |   #include <stdint.h>
 31 | #else
 32 |   typedef __int8            int8_t;
 33 |   typedef __int16           int16_t;
 34 |   typedef __int32           int32_t;
 35 |   typedef __int64           int64_t;
 36 |   typedef unsigned __int8   uint8_t;
 37 |   typedef unsigned __int16  uint16_t;
 38 |   typedef unsigned __int32  uint32_t;
 39 |   typedef unsigned __int64  uint64_t;
 40 | #endif
 41 | 
 42 | // fast filters: implements 3x3 and 5x5 sobel filters and 
 43 | //               5x5 blob and corner filters based on SSE2/3 instructions
 44 | namespace filter {
 45 |   
 46 |   // private namespace, public user functions at the bottom of this file
 47 |   namespace detail {
 48 |     void integral_image( const uint8_t* in, int32_t* out, int w, int h ) { // out[y*w+x] = sum of in over rows 0..y and columns 0..x
 49 |       int32_t* out_top = out; // follows the output row directly above the one being written
 50 |       const uint8_t* line_end = in + w;
 51 |       const uint8_t* in_end   = in + w*h;
 52 |       int32_t line_sum = 0;
 53 |       for( ; in != line_end; in++, out++ ) { // first row: plain running sum; runs unconditionally, so h >= 1 is assumed
 54 |         line_sum += *in;
 55 |         *out = line_sum;
 56 |       }
 57 |       for( ; in != in_end; ) { // remaining rows, one per iteration
 58 |         int32_t line_sum = 0; // shadows the outer line_sum: the running sum restarts on every row
 59 |         const uint8_t* line_end = in + w; // shadows the outer line_end: end of the current row
 60 |         for( ; in != line_end; in++, out++, out_top++ ) {
 61 |           line_sum += *in;
 62 |           *out = *out_top + line_sum; // integral value directly above + running sum of this row
 63 |         }
 64 |       }
 65 |     }
 66 |     
 67 |     void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) { // widens the 16 bytes of a to two registers of 16-bit values
 68 |       __m128i zero = _mm_setzero_si128();
 69 |       b0 = _mm_unpacklo_epi8( a, zero ); // low 8 bytes of a, each interleaved with a zero byte (zero extension)
 70 |       b1 = _mm_unpackhi_epi8( a, zero ); // high 8 bytes of a, widened the same way
 71 |     }
 72 |     
 73 |     void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) { // narrows 2x8 16-bit values to 16 bytes
 74 |       b = _mm_packus_epi16( a0, a1 ); // signed 16-bit -> unsigned 8-bit, saturated to [0,255]; a0 gives the low half, a1 the high half
 75 |     }
 76 |     
 77 |     // convolve image with a (1,4,6,4,1) row vector. Result is written to output starting at out+2 (existing values are overwritten).
 78 |     // each sum is divided by 128 (arithmetic shift), offset by +128, and finally saturated to [0,255].
 79 |     void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
 80 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
 81 |       const __m128i* i0 = (const __m128i*)(in); // read with a plain dereference below; NOTE(review): presumably requires in to be 16-byte aligned - confirm
 82 |       const int16_t* i1 = in+1;
 83 |       const int16_t* i2 = in+2;
 84 |       const int16_t* i3 = in+3;
 85 |       const int16_t* i4 = in+4;
 86 |       uint8_t* result   = out + 2; // output index of the middle tap (in+2)
 87 |       const int16_t* const end_input = in + w*h;
 88 |       __m128i offs = _mm_set1_epi16( 128 );
 89 |       for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) { // 16 outputs per iteration: 8 inputs advanced here + 8 inside
 90 |         __m128i result_register_lo;
 91 |         __m128i result_register_hi;
 92 |         for( int i=0; i<2; i++ ) { // i==0 computes the low 8 results, i==1 the high 8
 93 |           __m128i* result_register;
 94 |           if( i==0 ) result_register = &result_register_lo;
 95 |           else       result_register = &result_register_hi;
 96 |           __m128i i0_register = *i0;
 97 |           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
 98 |           __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
 99 |           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
100 |           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
101 |           *result_register = _mm_setzero_si128();
102 |           *result_register = _mm_add_epi16( i0_register, *result_register ); // + 1*i0
103 |           i1_register      = _mm_add_epi16( i1_register, i1_register  );
104 |           i1_register      = _mm_add_epi16( i1_register, i1_register  );
105 |           *result_register = _mm_add_epi16( i1_register, *result_register ); // + 4*i1
106 |           i2_register      = _mm_add_epi16( i2_register, i2_register  );
107 |           *result_register = _mm_add_epi16( i2_register, *result_register ); // + 2*i2
108 |           i2_register      = _mm_add_epi16( i2_register, i2_register  );
109 |           *result_register = _mm_add_epi16( i2_register, *result_register ); // + 4*i2, i.e. 6*i2 in total
110 |           i3_register      = _mm_add_epi16( i3_register, i3_register  );
111 |           i3_register      = _mm_add_epi16( i3_register, i3_register  );
112 |           *result_register = _mm_add_epi16( i3_register, *result_register ); // + 4*i3
113 |           *result_register = _mm_add_epi16( i4_register, *result_register ); // + 1*i4
114 |           *result_register = _mm_srai_epi16( *result_register, 7 ); // divide by 128
115 |           *result_register = _mm_add_epi16( *result_register, offs ); // shift so that 0 maps to 128
116 |           if( i==0 ) { // step to the next 8 inputs for the high half
117 |             i0 += 1;
118 |             i1 += 8;
119 |             i2 += 8;
120 |             i3 += 8;
121 |             i4 += 8;
122 |           }
123 |         }
124 |         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
125 |         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); // NOTE(review): in the last iteration the loads reach 4 elements past in+w*h and this store 2 bytes past out+w*h - confirm the buffers are padded
126 |       }
127 |     }
128 |     
129 |     // convolve image with a (1,2,0,-2,-1) row vector. Result is written to output starting at out+2 (existing values are overwritten).
130 |     // This one works on 16bit input and 8bit output.
131 |     // each sum is divided by 128 (arithmetic shift), offset by +128, and finally saturated to [0,255].
132 |     void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
133 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
134 |       const __m128i*  i0 = (const __m128i*)(in); // read with a plain dereference below; NOTE(review): presumably requires in to be 16-byte aligned - confirm
135 |       const int16_t* 	i1 = in+1;
136 |       const int16_t* 	i3 = in+3; // the middle tap (in+2) has weight 0 and is never loaded
137 |       const int16_t* 	i4 = in+4;
138 |       uint8_t* result    = out + 2; // output index of the middle tap (in+2)
139 |       const int16_t* const end_input = in + w*h;
140 |       __m128i offs = _mm_set1_epi16( 128 );
141 |       for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) { // 16 outputs per iteration: 8 inputs advanced here + 8 inside
142 |         __m128i result_register_lo;
143 |         __m128i result_register_hi;
144 |         for( int i=0; i<2; i++ ) { // i==0 computes the low 8 results, i==1 the high 8
145 |           __m128i* result_register;
146 |           if( i==0 ) result_register = &result_register_lo;
147 |           else       result_register = &result_register_hi;
148 |           __m128i i0_register = *i0;
149 |           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
150 |           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
151 |           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
152 |           *result_register = _mm_setzero_si128();
153 |           *result_register = _mm_add_epi16( i0_register,   *result_register ); // + 1*i0
154 |           i1_register      = _mm_add_epi16( i1_register, i1_register  );
155 |           *result_register = _mm_add_epi16( i1_register,   *result_register ); // + 2*i1
156 |           i3_register      = _mm_add_epi16( i3_register, i3_register  );
157 |           *result_register = _mm_sub_epi16( *result_register, i3_register ); // - 2*i3
158 |           *result_register = _mm_sub_epi16( *result_register, i4_register ); // - 1*i4
159 |           *result_register = _mm_srai_epi16( *result_register, 7 ); // divide by 128
160 |           *result_register = _mm_add_epi16( *result_register, offs ); // shift so that 0 maps to 128
161 |           if( i==0 ) { // step to the next 8 inputs for the high half
162 |             i0 += 1;
163 |             i1 += 8;
164 |             i3 += 8;
165 |             i4 += 8;
166 |           }
167 |         }
168 |         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
169 |         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo ); // NOTE(review): in the last iteration the loads reach 4 elements past in+w*h and this store 2 bytes past out+w*h - confirm the buffers are padded
170 |       }
171 |     }
172 | 
173 |     // convolve image with a (1,2,1) row vector. Result is accumulated into output.
174 |     // This one works on 16bit input and 8bit output.
175 |     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
176 |     void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
177 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
178 |       const __m128i* i0 = (const __m128i*)(in);
179 |       const int16_t* i1 = in+1;
180 |       const int16_t* i2 = in+2;
181 |       uint8_t* result   = out + 1;
182 |       const int16_t* const end_input = in + w*h;
183 |       const size_t blocked_loops = (w*h-2)/16;
184 |       __m128i offs = _mm_set1_epi16( 128 );
185 |       for( size_t i=0; i != blocked_loops; i++ ) {
186 |         __m128i result_register_lo;
187 |         __m128i result_register_hi;
188 |         __m128i i1_register;
189 |         __m128i i2_register;
190 |         
191 |         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
192 |         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
193 |         result_register_lo = *i0;
194 |         i1_register        = _mm_add_epi16( i1_register, i1_register );
195 |         result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
196 |         result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
197 |         result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
198 |         result_register_lo = _mm_add_epi16( result_register_lo, offs );
199 | 
200 |         i0++;
201 |         i1+=8;
202 |         i2+=8;
203 | 
204 |         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
205 |         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
206 |         result_register_hi = *i0;
207 |         i1_register        = _mm_add_epi16( i1_register, i1_register );
208 |         result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
209 |         result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
210 |         result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
211 |         result_register_hi = _mm_add_epi16( result_register_hi, offs );
212 | 
213 |         i0++;
214 |         i1+=8;
215 |         i2+=8;
216 | 
217 |         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
218 |         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
219 |       
220 |         result += 16;
221 |       }
222 |     }
223 |     
224 |     // convolve image with a (1,0,-1) row vector. Result is accumulated into output.
225 |     // This one works on 16bit input and 8bit output.
226 |     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
227 |     void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
228 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
229 |       const __m128i*  i0 = (const __m128i*)(in);
230 |       const int16_t* 	i2 = in+2;
231 |       uint8_t* result    = out + 1;
232 |       const int16_t* const end_input = in + w*h;
233 |       const size_t blocked_loops = (w*h-2)/16;
234 |       __m128i offs = _mm_set1_epi16( 128 );
235 |       for( size_t i=0; i != blocked_loops; i++ ) {
236 |         __m128i result_register_lo;
237 |         __m128i result_register_hi;
238 |         __m128i i2_register;
239 | 
240 |         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
241 |         result_register_lo  = *i0;
242 |         result_register_lo  = _mm_sub_epi16( result_register_lo, i2_register );
243 |         result_register_lo  = _mm_srai_epi16( result_register_lo, 2 );
244 |         result_register_lo  = _mm_add_epi16( result_register_lo, offs );
245 |  
246 |         i0 += 1;
247 |         i2 += 8;
248 |         
249 |         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
250 |         result_register_hi  = *i0;
251 |         result_register_hi  = _mm_sub_epi16( result_register_hi, i2_register );
252 |         result_register_hi  = _mm_srai_epi16( result_register_hi, 2 );
253 |         result_register_hi  = _mm_add_epi16( result_register_hi, offs );
254 | 
255 |         i0 += 1;
256 |         i2 += 8;
257 |         
258 |         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
259 |         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
260 | 
261 |         result += 16;
262 |       }
263 | 
264 |       for( ; i2 < end_input; i2++, result++) {
265 |         *result = ((*(i2-2) - *i2)>>2)+128;
266 |       }
267 |     }
268 |     
269 |     void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
270 |       using namespace std;
271 |       memset( out_h, 0, w*h*sizeof(int16_t) );
272 |       memset( out_v, 0, w*h*sizeof(int16_t) );
273 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
274 |       const int w_chunk  = w/16;
275 |       __m128i* 	i0       = (__m128i*)( in );
276 |       __m128i* 	i1       = (__m128i*)( in ) + w_chunk*1;
277 |       __m128i* 	i2       = (__m128i*)( in ) + w_chunk*2;
278 |       __m128i* 	i3       = (__m128i*)( in ) + w_chunk*3;
279 |       __m128i* 	i4       = (__m128i*)( in ) + w_chunk*4;
280 |       __m128i* result_h  = (__m128i*)( out_h ) + 4*w_chunk;
281 |       __m128i* result_v  = (__m128i*)( out_v ) + 4*w_chunk;
282 |       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
283 |       __m128i sixes      = _mm_set1_epi16( 6 );
284 |       __m128i fours      = _mm_set1_epi16( 4 );
285 |       for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {      
286 |         __m128i ilo, ihi;
287 |         unpack_8bit_to_16bit( *i0, ihi, ilo );
288 |         *result_h     = _mm_add_epi16( ihi, *result_h );
289 |         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
290 |         *result_v     = _mm_add_epi16( *result_v, ihi );
291 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
292 |         unpack_8bit_to_16bit( *i1, ihi, ilo );
293 |         *result_h     = _mm_add_epi16( ihi, *result_h );
294 |         *result_h     = _mm_add_epi16( ihi, *result_h );
295 |         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
296 |         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
297 |         ihi = _mm_mullo_epi16( ihi, fours );
298 |         ilo = _mm_mullo_epi16( ilo, fours );
299 |         *result_v     = _mm_add_epi16( *result_v, ihi );
300 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
301 |         unpack_8bit_to_16bit( *i2, ihi, ilo );
302 |         ihi = _mm_mullo_epi16( ihi, sixes );
303 |         ilo = _mm_mullo_epi16( ilo, sixes );
304 |         *result_v     = _mm_add_epi16( *result_v, ihi );
305 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
306 |         unpack_8bit_to_16bit( *i3, ihi, ilo );
307 |         *result_h     = _mm_sub_epi16( *result_h, ihi );
308 |         *result_h     = _mm_sub_epi16( *result_h, ihi );
309 |         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
310 |         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
311 |         ihi = _mm_mullo_epi16( ihi, fours );
312 |         ilo = _mm_mullo_epi16( ilo, fours );
313 |         *result_v     = _mm_add_epi16( *result_v, ihi );
314 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );          
315 |         unpack_8bit_to_16bit( *i4, ihi, ilo );
316 |         *result_h     = _mm_sub_epi16( *result_h, ihi );
317 |         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
318 |         *result_v     = _mm_add_epi16( *result_v, ihi );
319 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
320 |       }
321 |     }
322 |     
323 |     void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
324 |       memset( out, 0, w*h*sizeof(int16_t) );
325 |       using namespace std;
326 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
327 |       const int w_chunk  = w/16;
328 |       __m128i* 	i0       = (__m128i*)( in );
329 |       __m128i* 	i1       = (__m128i*)( in ) + w_chunk*1;
330 |       __m128i* 	i3       = (__m128i*)( in ) + w_chunk*3;
331 |       __m128i* 	i4       = (__m128i*)( in ) + w_chunk*4;
332 |       __m128i* result    = (__m128i*)( out ) + 4*w_chunk;
333 |       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
334 |       for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
335 |         __m128i ilo0, ihi0;
336 |         unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
337 |         __m128i ilo1, ihi1;
338 |         unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
339 |         *result     = _mm_add_epi16( ihi0, ihi1 );
340 |         *(result+1) = _mm_add_epi16( ilo0, ilo1 );
341 |         __m128i ilo, ihi;
342 |         unpack_8bit_to_16bit( *i3, ihi, ilo );
343 |         *result     = _mm_sub_epi16( *result, ihi );
344 |         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
345 |         unpack_8bit_to_16bit( *i4, ihi, ilo );
346 |         *result     = _mm_sub_epi16( *result, ihi );
347 |         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
348 |       }
349 |     }
350 |     
351 |     void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
352 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
353 |       const __m128i*  i0 = (const __m128i*)(in);
354 |       const int16_t* 	i1 = in+1;
355 |       const int16_t* 	i3 = in+3;
356 |       const int16_t* 	i4 = in+4;
357 |       int16_t* result    = out + 2;
358 |       const int16_t* const end_input = in + w*h;
359 |       for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
360 |         __m128i result_register;
361 |         __m128i i0_register = *i0;
362 |         __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
363 |         __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
364 |         __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
365 |         result_register     = _mm_add_epi16( i0_register,     i1_register );
366 |         result_register     = _mm_sub_epi16( result_register, i3_register );
367 |         result_register     = _mm_sub_epi16( result_register, i4_register );
368 |         _mm_storeu_si128( ((__m128i*)( result )), result_register );
369 |       }
370 |     }
371 |     
372 |     void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
373 |       using namespace std;
374 |       assert( w % 16 == 0 && "width must be multiple of 16!" );
375 |       const int w_chunk  = w/16;
376 |       __m128i* 	i0       = (__m128i*)( in );
377 |       __m128i* 	i1       = (__m128i*)( in ) + w_chunk*1;
378 |       __m128i* 	i2       = (__m128i*)( in ) + w_chunk*2;
379 |       __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
380 |       __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
381 |       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
382 |       for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
383 |         *result_h     = _mm_setzero_si128();
384 |         *(result_h+1) = _mm_setzero_si128();
385 |         *result_v     = _mm_setzero_si128();
386 |         *(result_v+1) = _mm_setzero_si128();
387 |         __m128i ilo, ihi;
388 |         unpack_8bit_to_16bit( *i0, ihi, ilo ); 
389 |         unpack_8bit_to_16bit( *i0, ihi, ilo );
390 |         *result_h     = _mm_add_epi16( ihi, *result_h );
391 |         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
392 |         *result_v     = _mm_add_epi16( *result_v, ihi );
393 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
394 |         unpack_8bit_to_16bit( *i1, ihi, ilo );
395 |         *result_v     = _mm_add_epi16( *result_v, ihi );
396 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
397 |         *result_v     = _mm_add_epi16( *result_v, ihi );
398 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
399 |         unpack_8bit_to_16bit( *i2, ihi, ilo );
400 |         *result_h     = _mm_sub_epi16( *result_h, ihi );
401 |         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
402 |         *result_v     = _mm_add_epi16( *result_v, ihi );
403 |         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
404 |       }
405 |     }
406 |   };
407 |   
408 |   void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
409 |     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
410 |     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );    
411 |     detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
412 |     detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
413 |     detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
414 |     _mm_free( temp_h );
415 |     _mm_free( temp_v );
416 |   }
417 |   
418 |   void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
419 |     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
420 |     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
421 |     detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
422 |     detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
423 |     detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
424 |     _mm_free( temp_h );
425 |     _mm_free( temp_v );
426 |   }
427 |   
428 |   // -1 -1  0  1  1
429 |   // -1 -1  0  1  1
430 |   //  0  0  0  0  0
431 |   //  1  1  0 -1 -1
432 |   //  1  1  0 -1 -1
433 |   void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
434 |     int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
435 |     detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
436 |     detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
437 |     _mm_free( temp );
438 |   }
439 |   
440 |   // -1 -1 -1 -1 -1
441 |   // -1  1  1  1 -1
442 |   // -1  1  8  1 -1
443 |   // -1  1  1  1 -1
444 |   // -1 -1 -1 -1 -1
445 |   void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
446 |     int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
447 |     detail::integral_image( in, integral, w, h );
448 |     int16_t* out_ptr   = out + 3 + 3*w;
449 |     int16_t* out_end   = out + w * h - 2 - 2*w;
450 |     const int32_t* i00 = integral;
451 |     const int32_t* i50 = integral + 5;
452 |     const int32_t* i05 = integral + 5*w;
453 |     const int32_t* i55 = integral + 5 + 5*w;
454 |     const int32_t* i11 = integral + 1 + 1*w;
455 |     const int32_t* i41 = integral + 4 + 1*w;
456 |     const int32_t* i14 = integral + 1 + 4*w;
457 |     const int32_t* i44 = integral + 4 + 4*w;    
458 |     const uint8_t* im22 = in + 3 + 3*w;
459 |     for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
460 |       int32_t result = 0;
461 |       result = -( *i55 - *i50 - *i05 + *i00 );
462 |       result += 2*( *i44 - *i41 - *i14 + *i11 );
463 |       result += 7* *im22;
464 |       *out_ptr = result;
465 |     }
466 |     _mm_free( integral );
467 |   }
468 | };
469 | 


--------------------------------------------------------------------------------
/CPU/filter.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Julius Ziegler, Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #ifndef __FILTER_H__
 23 | #define __FILTER_H__
 24 | 
 25 | #include <emmintrin.h>
 26 | #include <pmmintrin.h>
 27 | 
 28 | // define fixed-width datatypes for Visual Studio projects
 29 | #ifndef _MSC_VER
 30 |   #include <stdint.h>
 31 | #else
 32 |   typedef __int8            int8_t;
 33 |   typedef __int16           int16_t;
 34 |   typedef __int32           int32_t;
 35 |   typedef __int64           int64_t;
 36 |   typedef unsigned __int8   uint8_t;
 37 |   typedef unsigned __int16  uint16_t;
 38 |   typedef unsigned __int32  uint32_t;
 39 |   typedef unsigned __int64  uint64_t;
 40 | #endif
 41 | 
 42 | // fast filters: implements 3x3 and 5x5 sobel filters and 
 43 | //               5x5 blob and corner filters based on SSE2/3 instructions
namespace filter {
  
  // private namespace, public user functions at the bottom of this file
  namespace detail {
    // helpers used by the filters below (implemented in filter.cpp)
    void integral_image( const uint8_t* in, int32_t* out, int w, int h );
    void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 );
    void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b );
    
    // convolve image with a (1,4,6,4,1) row vector. Result is stored in output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/128, offset by +128 and packed to 8 bit with saturation.
    // w must be a multiple of 16.
    void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h );
    
    // convolve image with a (1,2,0,-2,-1) row vector. Result is stored in output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/128, offset by +128 and packed to 8 bit with saturation.
    // w must be a multiple of 16.
    void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h );

    // convolve image with a (1,2,1) row vector. Result is stored in output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/4, offset by +128 and packed to 8 bit with saturation.
    // w must be a multiple of 16.
    void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h );
    
    // convolve image with a (1,0,-1) row vector. Result is stored in output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/4, offset by +128 and packed to 8 bit with saturation.
    // w must be a multiple of 16.
    void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h );
    
    // column passes of the 5x5 sobel filter on an 8bit image:
    // out_v receives the (1,4,6,4,1) and out_h the (1,2,0,-2,-1) column
    // convolution, both as 16bit. w must be a multiple of 16.
    void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h );
    
    // column pass with the kernel (+1,+1,0,-1,-1): 8bit input, 16bit output.
    // w must be a multiple of 16.
    void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h );
    
    // row pass with the kernel (+1,+1,0,-1,-1): 16bit input, 16bit output,
    // no scaling. w must be a multiple of 16.
    void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h );
    
    // column passes of the 3x3 sobel filter on an 8bit image:
    // out_v receives the (1,2,1) and out_h the (1,0,-1) column convolution,
    // both as 16bit. w must be a multiple of 16.
    void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h );
  }
  
  // 3x3 sobel filter: 8bit input image of w*h pixels, 8bit outputs.
  // out_v: (1,2,1) column pass followed by (1,0,-1) row pass
  // out_h: (1,0,-1) column pass followed by (1,2,1) row pass
  void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h );
  
  // 5x5 sobel filter: 8bit input image of w*h pixels, 8bit outputs.
  // out_v: (1,4,6,4,1) column pass followed by (1,2,0,-2,-1) row pass
  // out_h: (1,2,0,-2,-1) column pass followed by (1,4,6,4,1) row pass
  void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h );
  
  // 5x5 checkerboard filter, 8bit input and 16bit output:
  // -1 -1  0  1  1
  // -1 -1  0  1  1
  //  0  0  0  0  0
  //  1  1  0 -1 -1
  //  1  1  0 -1 -1
  void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h );
  
  // 5x5 blob filter, 8bit input and 16bit output:
  // -1 -1 -1 -1 -1
  // -1  1  1  1 -1
  // -1  1  8  1 -1
  // -1  1  1  1 -1
  // -1 -1 -1 -1 -1
  void blob5x5( const uint8_t* in, int16_t* out, int w, int h );
};
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/CPU/image.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | // basic image I/O, based on Pedro Felzenszwalb's code
 23 | 
 24 | #ifndef IMAGE_H
 25 | #define IMAGE_H
 26 | 
 27 | #include <cstdlib>
 28 | #include <climits>
 29 | #include <cstring>
 30 | #include <fstream>
 31 | 
 32 | // use imRef to access image data.
 33 | #define imRef(im, x, y) (im->access[y][x])
 34 |   
 35 | // use imPtr to get pointer to image data.
 36 | #define imPtr(im, x, y) &(im->access[y][x])
 37 | 
 38 | #define BUF_SIZE 256
 39 | 
 40 | typedef unsigned char uchar;
 41 | typedef struct { uchar r, g, b; } rgb;
 42 | 
 43 | inline bool operator==(const rgb &a, const rgb &b) {
 44 |   return ((a.r == b.r) && (a.g == b.g) && (a.b == b.b));
 45 | }
 46 | 
// image class: a width x height array of T stored row by row in one
// contiguous buffer, plus a table of row pointers for [y][x] access.
// NOTE(review): the class owns 'data' and 'access' (released in the
// destructor) but declares no copy constructor or assignment operator, so
// copying an image object by value would free both buffers twice - use copy().
template <class T> class image {
public:

  // create image (pixel memory is zeroed only if init is true)
  image(const int width, const int height, const bool init = false);

  // delete image
  ~image();

  // init image: set every pixel to val
  void init(const T &val);

  // deep copy; the returned image is allocated with new and owned by the caller
  image<T> *copy() const;
  
  // get image width/height
  int width() const { return w; }
  int height() const { return h; }
  
  // image data (w*h elements, row-major)
  T *data;
  
  // row pointers (access[y] points to the first pixel of row y)
  T **access;
  
private:
  int w, h;
};
 76 | 
 77 | template <class T> image<T>::image(const int width, const int height, const bool init) {
 78 |   w = width;
 79 |   h = height;
 80 |   data = new T[w * h];  // allocate space for image data
 81 |   access = new T*[h];   // allocate space for row pointers
 82 |   
 83 |   // initialize row pointers
 84 |   for (int i = 0; i < h; i++)
 85 |     access[i] = data + (i * w);  
 86 |   
 87 |   // init to zero
 88 |   if (init)
 89 |     memset(data, 0, w * h * sizeof(T));
 90 | }
 91 | 
// Release the pixel buffer and the row pointer table allocated by the
// constructor.
template <class T> image<T>::~image() {
  delete [] data; 
  delete [] access;
}
 96 | 
 97 | template <class T> void image<T>::init(const T &val) {
 98 |   T *ptr = imPtr(this, 0, 0);
 99 |   T *end = imPtr(this, w-1, h-1);
100 |   while (ptr <= end)
101 |     *ptr++ = val;
102 | }
103 | 
104 | 
105 | template <class T> image<T> *image<T>::copy() const {
106 |   image<T> *im = new image<T>(w, h, false);
107 |   memcpy(im->data, data, w * h * sizeof(T));
108 |   return im;
109 | }
110 | 
// exception type thrown by loadPGM when a file cannot be read as 8-bit binary PGM
class pnm_error {};
112 | 
113 | void pnm_read(std::ifstream &file, char *buf) {
114 |   char doc[BUF_SIZE];
115 |   char c;
116 |   
117 |   file >> c;
118 |   while (c == '#') {
119 |     file.getline(doc, BUF_SIZE);
120 |     file >> c;
121 |   }
122 |   file.putback(c);
123 |   
124 |   file.width(BUF_SIZE);
125 |   file >> buf;
126 |   file.ignore();
127 | }
128 | 
129 | image<uchar> *loadPGM(const char *name) {
130 |   char buf[BUF_SIZE];
131 |   
132 |   // read header
133 |   std::ifstream file(name, std::ios::in | std::ios::binary);
134 |   pnm_read(file, buf);
135 |   if (strncmp(buf, "P5", 2)) {
136 |     std::cout << "ERROR: Could not read file " << name << std::endl;
137 |     throw pnm_error();
138 |   }
139 | 
140 |   pnm_read(file, buf);
141 |   int width = atoi(buf);
142 |   pnm_read(file, buf);
143 |   int height = atoi(buf);
144 | 
145 |   pnm_read(file, buf);
146 |   if (atoi(buf) > UCHAR_MAX) {
147 |     std::cout << "ERROR: Could not read file " << name << std::endl;
148 |     throw pnm_error();
149 |   }
150 | 
151 |   // read data
152 |   image<uchar> *im = new image<uchar>(width, height);
153 |   file.read((char *)imPtr(im, 0, 0), width * height * sizeof(uchar));
154 | 
155 |   return im;
156 | }
157 | 
158 | void savePGM(image<uchar> *im, const char *name) {
159 |   int width = im->width();
160 |   int height = im->height();
161 |   std::ofstream file(name, std::ios::out | std::ios::binary);
162 | 
163 |   file << "P5\n" << width << " " << height << "\n" << UCHAR_MAX << "\n";
164 |   file.write((char *)imPtr(im, 0, 0), width * height * sizeof(uchar));
165 | }
166 | 
167 | #endif
168 | 


--------------------------------------------------------------------------------
/CPU/matrix.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libviso2.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libviso2 is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 2 of the License, or any later version.
 12 | 
 13 | libviso2 is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libviso2; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #include "matrix.h"
 23 | #include <algorithm>
 24 | #include <math.h>
 25 | 
// Helper macros. SQR, FMAX and IMIN store their arguments in the file-scope
// scratch variables below so that each argument expression is evaluated only
// once; because that scratch storage is shared, they must not be nested within
// themselves or used from several threads at the same time.

// exchange a and b; the code using it must declare a variable named 'temp'
#define SWAP(a,b) {temp=a;a=b;b=temp;}
// magnitude of a with the sign of b (b == 0 counts as non-negative)
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
static FLOAT sqrarg;
// square of a
#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
static FLOAT maxarg1,maxarg2;
// maximum of two floating point values
#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
static int32_t iminarg1,iminarg2;
// minimum of two integer values
#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
 34 | 
 35 | 
 36 | using namespace std;
 37 | 
 38 | Matrix::Matrix () {
 39 |   m   = 0;
 40 |   n   = 0;
 41 |   val = 0;
 42 | }
 43 | 
// Create an m_ x n_ matrix; the storage is set up by allocateMemory().
Matrix::Matrix (const int32_t m_,const int32_t n_) {
  allocateMemory(m_,n_);
}
 47 | 
 48 | Matrix::Matrix (const int32_t m_,const int32_t n_,const FLOAT* val_) {
 49 |   allocateMemory(m_,n_);
 50 |   int32_t k=0;
 51 |   for (int32_t i=0; i<m_; i++)
 52 |     for (int32_t j=0; j<n_; j++)
 53 |       val[i][j] = val_[k++];
 54 | }
 55 | 
 56 | Matrix::Matrix (const Matrix &M) {
 57 |   allocateMemory(M.m,M.n);
 58 |   for (int32_t i=0; i<M.m; i++)
 59 |     memcpy(val[i],M.val[i],M.n*sizeof(FLOAT));
 60 | }
 61 | 
// Destructor: hands the storage back through releaseMemory().
Matrix::~Matrix () {
  releaseMemory();
}
 65 | 
 66 | Matrix& Matrix::operator= (const Matrix &M) {
 67 |   if (this!=&M) {
 68 |     if (M.m!=m || M.n!=n) {
 69 |       releaseMemory();
 70 |       allocateMemory(M.m,M.n);
 71 |     }
 72 |     if (M.n>0)
 73 |       for (int32_t i=0; i<M.m; i++)
 74 |         memcpy(val[i],M.val[i],M.n*sizeof(FLOAT));
 75 |   }
 76 |   return *this;
 77 | }
 78 | 
 79 | void Matrix::getData(FLOAT* val_,int32_t i1,int32_t j1,int32_t i2,int32_t j2) {
 80 |   if (i2==-1) i2 = m-1;
 81 |   if (j2==-1) j2 = n-1;
 82 |   int32_t k=0;
 83 |   for (int32_t i=i1; i<=i2; i++)
 84 |     for (int32_t j=j1; j<=j2; j++)
 85 |       val_[k++] = val[i][j];
 86 | }
 87 | 
 88 | Matrix Matrix::getMat(int32_t i1,int32_t j1,int32_t i2,int32_t j2) {
 89 |   if (i2==-1) i2 = m-1;
 90 |   if (j2==-1) j2 = n-1;
 91 |   if (i1<0 || i2>=m || j1<0 || j2>=n || i2<i1 || j2<j1) {
 92 |     cerr << "ERROR: Cannot get submatrix [" << i1 << ".." << i2 <<
 93 |         "] x [" << j1 << ".." << j2 << "]" <<
 94 |         " of a (" << m << "x" << n << ") matrix." << endl;
 95 |     exit(0);
 96 |   }
 97 |   Matrix M(i2-i1+1,j2-j1+1);
 98 |   for (int32_t i=0; i<M.m; i++)
 99 |     for (int32_t j=0; j<M.n; j++)
100 |       M.val[i][j] = val[i1+i][j1+j];
101 |   return M;
102 | }
103 | 
104 | void Matrix::setMat(const Matrix &M,const int32_t i1,const int32_t j1) {
105 |   if (i1<0 || j1<0 || i1+M.m>m || j1+M.n>n) {
106 |     cerr << "ERROR: Cannot set submatrix [" << i1 << ".." << i1+M.m-1 <<
107 |         "] x [" << j1 << ".." << j1+M.n-1 << "]" <<
108 |         " of a (" << m << "x" << n << ") matrix." << endl;
109 |     exit(0);
110 |   }
111 |   for (int32_t i=0; i<M.m; i++)
112 |     for (int32_t j=0; j<M.n; j++)
113 |       val[i1+i][j1+j] = M.val[i][j];
114 | }
115 | 
116 | void Matrix::setVal(FLOAT s,int32_t i1,int32_t j1,int32_t i2,int32_t j2) {
117 |   if (i2==-1) i2 = m-1;
118 |   if (j2==-1) j2 = n-1;
119 |   if (i2<i1 || j2<j1) {
120 |     cerr << "ERROR in setVal: Indices must be ordered (i1<=i2, j1<=j2)." << endl;
121 |     exit(0);
122 |   }
123 |   for (int32_t i=i1; i<=i2; i++)
124 |     for (int32_t j=j1; j<=j2; j++)
125 |       val[i][j] = s;
126 | }
127 | 
128 | void Matrix::setDiag(FLOAT s,int32_t i1,int32_t i2) {
129 |   if (i2==-1) i2 = min(m-1,n-1);
130 |   for (int32_t i=i1; i<=i2; i++)
131 |     val[i][i] = s;
132 | }
133 | 
// Set elements to zero via setVal(); the affected range is whatever setVal's
// default index arguments select.
void Matrix::zero() {
  setVal(0);
}
137 | 
138 | Matrix Matrix::extractCols (vector<int> idx) {
139 |   Matrix M(m,idx.size());
140 |   for (int32_t j=0; j<M.n; j++)
141 |     if (idx[j]<n)
142 |       for (int32_t i=0; i<m; i++)
143 |         M.val[i][j] = val[i][idx[j]];
144 |   return M;
145 | }
146 | 
147 | Matrix Matrix::eye (const int32_t m) {
148 |   Matrix M(m,m);
149 |   for (int32_t i=0; i<m; i++)
150 |     M.val[i][i] = 1;
151 |   return M;
152 | }
153 | 
154 | void Matrix::eye () {
155 |   for (int32_t i=0; i<m; i++)
156 |     for (int32_t j=0; j<n; j++)
157 |       val[i][j] = 0;
158 |   for (int32_t i=0; i<min(m,n); i++)
159 |     val[i][i] = 1;
160 | }
161 | 
162 | Matrix Matrix::diag (const Matrix &M) {
163 |   if (M.m>1 && M.n==1) {
164 |     Matrix D(M.m,M.m);
165 |     for (int32_t i=0; i<M.m; i++)
166 |       D.val[i][i] = M.val[i][0];
167 |     return D;
168 |   } else if (M.m==1 && M.n>1) {
169 |     Matrix D(M.n,M.n);
170 |     for (int32_t i=0; i<M.n; i++)
171 |       D.val[i][i] = M.val[0][i];
172 |     return D;
173 |   }
174 |   cout << "ERROR: Trying to create diagonal matrix from vector of size (" << M.m << "x" << M.n << ")" << endl;
175 |   exit(0);
176 | }
177 | 
178 | Matrix Matrix::reshape(const Matrix &M,int32_t m_,int32_t n_) {
179 |   if (M.m*M.n != m_*n_) {
180 |     cerr << "ERROR: Trying to reshape a matrix of size (" << M.m << "x" << M.n <<
181 |             ") to size (" << m_ << "x" << n_ << ")" << endl;
182 |     exit(0);
183 |   }
184 |   Matrix M2(m_,n_);
185 |   for (int32_t k=0; k<m_*n_; k++) {
186 |     int32_t i1 = k/M.n;
187 |     int32_t j1 = k%M.n;
188 |     int32_t i2 = k/n_;
189 |     int32_t j2 = k%n_;
190 |     M2.val[i2][j2] = M.val[i1][j1];
191 |   }
192 |   return M2;
193 | }
194 | 
195 | Matrix Matrix::rotMatX (const FLOAT &angle) {
196 |   FLOAT s = sin(angle);
197 |   FLOAT c = cos(angle);
198 |   Matrix R(3,3);
199 |   R.val[0][0] = +1;
200 |   R.val[1][1] = +c;
201 |   R.val[1][2] = -s;
202 |   R.val[2][1] = +s;
203 |   R.val[2][2] = +c;
204 |   return R;
205 | }
206 | 
207 | Matrix Matrix::rotMatY (const FLOAT &angle) {
208 |   FLOAT s = sin(angle);
209 |   FLOAT c = cos(angle);
210 |   Matrix R(3,3);
211 |   R.val[0][0] = +c;
212 |   R.val[0][2] = +s;
213 |   R.val[1][1] = +1;
214 |   R.val[2][0] = -s;
215 |   R.val[2][2] = +c;
216 |   return R;
217 | }
218 | 
219 | Matrix Matrix::rotMatZ (const FLOAT &angle) {
220 |   FLOAT s = sin(angle);
221 |   FLOAT c = cos(angle);
222 |   Matrix R(3,3);
223 |   R.val[0][0] = +c;
224 |   R.val[0][1] = -s;
225 |   R.val[1][0] = +s;
226 |   R.val[1][1] = +c;
227 |   R.val[2][2] = +1;
228 |   return R;
229 | }
230 | 
231 | Matrix Matrix::operator+ (const Matrix &M) {
232 |   const Matrix &A = *this;
233 |   const Matrix &B = M;
234 |   if (A.m!=B.m || A.n!=B.n) {
235 |     cerr << "ERROR: Trying to add matrices of size (" << A.m << "x" << A.n <<
236 |         ") and (" << B.m << "x" << B.n << ")" << endl;
237 |     exit(0);
238 |   }
239 |   Matrix C(A.m,A.n);
240 |   for (int32_t i=0; i<m; i++)
241 |     for (int32_t j=0; j<n; j++)
242 |       C.val[i][j] = A.val[i][j]+B.val[i][j];
243 |   return C;
244 | }
245 | 
246 | Matrix Matrix::operator- (const Matrix &M) {
247 |   const Matrix &A = *this;
248 |   const Matrix &B = M;
249 |   if (A.m!=B.m || A.n!=B.n) {
250 |     cerr << "ERROR: Trying to subtract matrices of size (" << A.m << "x" << A.n <<
251 |         ") and (" << B.m << "x" << B.n << ")" << endl;
252 |     exit(0);
253 |   }
254 |   Matrix C(A.m,A.n);
255 |   for (int32_t i=0; i<m; i++)
256 |     for (int32_t j=0; j<n; j++)
257 |       C.val[i][j] = A.val[i][j]-B.val[i][j];
258 |   return C;
259 | }
260 | 
261 | Matrix Matrix::operator* (const Matrix &M) {
262 |   const Matrix &A = *this;
263 |   const Matrix &B = M;
264 |   if (A.n!=B.m) {
265 |     cerr << "ERROR: Trying to multiply matrices of size (" << A.m << "x" << A.n <<
266 |         ") and (" << B.m << "x" << B.n << ")" << endl;
267 |     exit(0);
268 |   }
269 |   Matrix C(A.m,B.n);
270 |   for (int32_t i=0; i<A.m; i++)
271 |     for (int32_t j=0; j<B.n; j++)
272 |       for (int32_t k=0; k<A.n; k++)
273 |         C.val[i][j] += A.val[i][k]*B.val[k][j];
274 |   return C;
275 | }
276 | 
277 | Matrix Matrix::operator* (const FLOAT &s) {
278 |   Matrix C(m,n);
279 |   for (int32_t i=0; i<m; i++)
280 |     for (int32_t j=0; j<n; j++)
281 |       C.val[i][j] = val[i][j]*s;
282 |   return C;
283 | }
284 | 
285 | Matrix Matrix::operator/ (const Matrix &M) {
286 |   const Matrix &A = *this;
287 |   const Matrix &B = M;
288 |   
289 |   if (A.m==B.m && A.n==B.n) {
290 |     Matrix C(A.m,A.n);
291 |     for (int32_t i=0; i<A.m; i++)
292 |       for (int32_t j=0; j<A.n; j++)
293 |         if (B.val[i][j]!=0)
294 |           C.val[i][j] = A.val[i][j]/B.val[i][j];
295 |     return C;
296 |     
297 |   } else if (A.m==B.m && B.n==1) {
298 |     Matrix C(A.m,A.n);
299 |     for (int32_t i=0; i<A.m; i++)
300 |       for (int32_t j=0; j<A.n; j++)
301 |         if (B.val[i][0]!=0)
302 |           C.val[i][j] = A.val[i][j]/B.val[i][0];
303 |     return C;
304 |     
305 |   } else if (A.n==B.n && B.m==1) {
306 |     Matrix C(A.m,A.n);
307 |     for (int32_t i=0; i<A.m; i++)
308 |       for (int32_t j=0; j<A.n; j++)
309 |         if (B.val[0][j]!=0)
310 |           C.val[i][j] = A.val[i][j]/B.val[0][j];
311 |     return C;
312 |     
313 |   } else {
314 |     cerr << "ERROR: Trying to divide matrices of size (" << A.m << "x" << A.n <<
315 |         ") and (" << B.m << "x" << B.n << ")" << endl;
316 |     exit(0);
317 |   } 
318 | }
319 | 
320 | Matrix Matrix::operator/ (const FLOAT &s) {
321 |   if (fabs(s)<1e-20) {
322 |     cerr << "ERROR: Trying to divide by zero!" << endl;
323 |     exit(0);
324 |   }
325 |   Matrix C(m,n);
326 |   for (int32_t i=0; i<m; i++)
327 |     for (int32_t j=0; j<n; j++)
328 |       C.val[i][j] = val[i][j]/s;
329 |   return C;
330 | }
331 | 
332 | Matrix Matrix::operator- () {
333 |   Matrix C(m,n);
334 |   for (int32_t i=0; i<m; i++)
335 |     for (int32_t j=0; j<n; j++)
336 |       C.val[i][j] = -val[i][j];
337 |   return C;
338 | }
339 | 
340 | Matrix Matrix::operator~ () {
341 |   Matrix C(n,m);
342 |   for (int32_t i=0; i<m; i++)
343 |     for (int32_t j=0; j<n; j++)
344 |       C.val[j][i] = val[i][j];
345 |   return C;
346 | }
347 | 
348 | FLOAT Matrix::l2norm () {
349 |   FLOAT norm = 0;
350 |   for (int32_t i=0; i<m; i++)
351 |     for (int32_t j=0; j<n; j++)
352 |       norm += val[i][j]*val[i][j];
353 |   return sqrt(norm);
354 | }
355 | 
356 | FLOAT Matrix::mean () {
357 |   FLOAT mean = 0;
358 |   for (int32_t i=0; i<m; i++)
359 |     for (int32_t j=0; j<n; j++)
360 |       mean += val[i][j];
361 |   return mean/(FLOAT)(m*n);
362 | }
363 | 
364 | Matrix Matrix::cross (const Matrix &a, const Matrix &b) {
365 |   if (a.m!=3 || a.n!=1 || b.m!=3 || b.n!=1) {
366 |     cerr << "ERROR: Cross product vectors must be of size (3x1)" << endl;
367 |     exit(0);
368 |   }
369 |   Matrix c(3,1);
370 |   c.val[0][0] = a.val[1][0]*b.val[2][0]-a.val[2][0]*b.val[1][0];
371 |   c.val[1][0] = a.val[2][0]*b.val[0][0]-a.val[0][0]*b.val[2][0];
372 |   c.val[2][0] = a.val[0][0]*b.val[1][0]-a.val[1][0]*b.val[0][0];
373 |   return c;
374 | }
375 | 
376 | Matrix Matrix::inv (const Matrix &M) {
377 |   if (M.m!=M.n) {
378 |     cerr << "ERROR: Trying to invert matrix of size (" << M.m << "x" << M.n << ")" << endl;
379 |     exit(0);
380 |   }
381 |   Matrix A(M);
382 |   Matrix B = eye(M.m);
383 |   B.solve(A);
384 |   return B;
385 | }
386 | 
387 | bool Matrix::inv () {
388 |   if (m!=n) {
389 |     cerr << "ERROR: Trying to invert matrix of size (" << m << "x" << n << ")" << endl;
390 |     exit(0);
391 |   }
392 |   Matrix A(*this);
393 |   eye();
394 |   solve(A);
395 |   return true;
396 | }
397 | 
398 | FLOAT Matrix::det () {
399 |   
400 |   if (m != n) {
401 |     cerr << "ERROR: Trying to compute determinant of a matrix of size (" << m << "x" << n << ")" << endl;
402 |     exit(0);
403 |   }
404 |     
405 |   Matrix A(*this);
406 |   int32_t *idx = (int32_t*)malloc(m*sizeof(int32_t));
407 |   FLOAT d;
408 |   A.lu(idx,d);
409 |   for( int32_t i=0; i<m; i++)
410 |     d *= A.val[i][i];
411 |   free(idx);
412 | }
413 | 
414 | bool Matrix::solve (const Matrix &M, FLOAT eps) {
415 |   
416 |   // substitutes
417 |   const Matrix &A = M;
418 |   Matrix &B       = *this;
419 |   
420 |   if (A.m != A.n || A.m != B.m || A.m<1 || B.n<1) {
421 |     cerr << "ERROR: Trying to eliminate matrices of size (" << A.m << "x" << A.n <<
422 |             ") and (" << B.m << "x" << B.n << ")" << endl;
423 |     exit(0);
424 |   }
425 |   
426 |   // index vectors for bookkeeping on the pivoting
427 |   int32_t* indxc = new int32_t[m];
428 |   int32_t* indxr = new int32_t[m];
429 |   int32_t* ipiv  = new int32_t[m];
430 |   
431 |   // loop variables
432 |   int32_t i, icol, irow, j, k, l, ll;
433 |   FLOAT big, dum, pivinv, temp;
434 |   
435 |   // initialize pivots to zero
436 |   for (j=0;j<m;j++) ipiv[j]=0;
437 |   
438 |   // main loop over the columns to be reduced
439 |   for (i=0;i<m;i++) {
440 |     
441 |     big=0.0;
442 |     
443 |     // search for a pivot element
444 |     for (j=0;j<m;j++)
445 |       if (ipiv[j]!=1)
446 |         for (k=0;k<m;k++)
447 |           if (ipiv[k]==0)
448 |             if (fabs(A.val[j][k])>=big) {
449 |       big=fabs(A.val[j][k]);
450 |       irow=j;
451 |       icol=k;
452 |             }
453 |     ++(ipiv[icol]);
454 |     
455 |     // We now have the pivot element, so we interchange rows, if needed, to put the pivot
456 |     // element on the diagonal. The columns are not physically interchanged, only relabeled.
457 |     if (irow != icol) {
458 |       for (l=0;l<m;l++) SWAP(A.val[irow][l], A.val[icol][l])
459 |       for (l=0;l<n;l++) SWAP(B.val[irow][l], B.val[icol][l])
460 |     }
461 |     
462 |     indxr[i]=irow; // We are now ready to divide the pivot row by the
463 |     indxc[i]=icol; // pivot element, located at irow and icol.
464 |     
465 |     // check for singularity
466 |     if (fabs(A.val[icol][icol]) < eps) {
467 |       delete[] indxc;
468 |       delete[] indxr;
469 |       delete[] ipiv;
470 |       return false;
471 |     }
472 |     
473 |     pivinv=1.0/A.val[icol][icol];
474 |     A.val[icol][icol]=1.0;
475 |     for (l=0;l<m;l++) A.val[icol][l] *= pivinv;
476 |     for (l=0;l<n;l++) B.val[icol][l] *= pivinv;
477 |     
478 |     // Next, we reduce the rows except for the pivot one
479 |     for (ll=0;ll<m;ll++)
480 |       if (ll!=icol) {
481 |       dum = A.val[ll][icol];
482 |       A.val[ll][icol] = 0.0;
483 |       for (l=0;l<m;l++) A.val[ll][l] -= A.val[icol][l]*dum;
484 |       for (l=0;l<n;l++) B.val[ll][l] -= B.val[icol][l]*dum;
485 |       }
486 |   }
487 |   
488 |   // This is the end of the main loop over columns of the reduction. It only remains to unscramble
489 |   // the solution in view of the column interchanges. We do this by interchanging pairs of
490 |   // columns in the reverse order that the permutation was built up.
491 |   for (l=m-1;l>=0;l--) {
492 |     if (indxr[l]!=indxc[l])
493 |       for (k=0;k<m;k++)
494 |         SWAP(A.val[k][indxr[l]], A.val[k][indxc[l]])
495 |   }
496 |   
497 |   // success
498 |   delete[] indxc;
499 |   delete[] indxr;
500 |   delete[] ipiv;
501 |   return true;
502 | }
503 | 
504 | // Given a matrix a[1..n][1..n], this routine replaces it by the LU decomposition of a rowwise
505 | // permutation of itself. a and n are input. a is output, arranged as in equation (2.3.14) above;
506 | // indx[1..n] is an output vector that records the row permutation effected by the partial
507 | // pivoting; d is output as ±1 depending on whether the number of row interchanges was even
508 | // or odd, respectively. This routine is used in combination with lubksb to solve linear equations
509 | // or invert a matrix.
510 | 
511 | bool Matrix::lu(int32_t *idx, FLOAT &d, FLOAT eps) {
512 |   
513 |   if (m != n) {
514 |     cerr << "ERROR: Trying to LU decompose a matrix of size (" << m << "x" << n << ")" << endl;
515 |     exit(0);
516 |   }
517 |   
518 |   int32_t i,imax,j,k;
519 |   FLOAT   big,dum,sum,temp;
520 |   FLOAT* vv = (FLOAT*)malloc(n*sizeof(FLOAT)); // vv stores the implicit scaling of each row.
521 |   d = 1.0;
522 |   for (i=0; i<n; i++) { // Loop over rows to get the implicit scaling information.
523 |     big = 0.0;
524 |     for (j=0; j<n; j++)
525 |       if ((temp=fabs(val[i][j]))>big)
526 |         big = temp;
527 |     if (big == 0.0) { // No nonzero largest element.
528 |       free(vv);
529 |       return false;
530 |     }
531 |     vv[i] = 1.0/big; // Save the scaling.
532 |   }
533 |   for (j=0; j<n; j++) { // This is the loop over columns of Crout’s method.
534 |     for (i=0; i<j; i++) { // This is equation (2.3.12) except for i = j.
535 |       sum = val[i][j];
536 |       for (k=0; k<i; k++)
537 |         sum -= val[i][k]*val[k][j];
538 |       val[i][j] = sum;
539 |     }
540 |     big = 0.0; // Initialize the search for largest pivot element.
541 |     for (i=j; i<n; i++) {
542 |       sum = val[i][j];
543 |       for (k=0; k<j; k++)
544 |         sum -= val[i][k]*val[k][j];
545 |       val[i][j] = sum;
546 |       if ( (dum=vv[i]*fabs(sum))>=big) {
547 |         big  = dum;
548 |         imax = i;
549 |       }
550 |     }
551 |     if (j!=imax) { // Do we need to interchange rows?
552 |       for (k=0; k<n; k++) { // Yes, do so...
553 |         dum          = val[imax][k];
554 |         val[imax][k] = val[j][k];
555 |         val[j][k]    = dum;
556 |       }
557 |       d = -d;     // ...and change the parity of d.
558 |       vv[imax]=vv[j]; // Also interchange the scale factor.
559 |     }
560 |     idx[j] = imax;
561 |     if (j!=n-1) { // Now, finally, divide by the pivot element.
562 |       dum = 1.0/val[j][j];
563 |       for (i=j+1; i<n; i++)
564 |         val[i][j] *= dum;
565 |     }
566 |   } // Go back for the next column in the reduction.
567 |   
568 |   // success
569 |   free(vv);
570 |   return true;
571 | }
572 | 
// Computes the singular value decomposition of this (m x n) matrix,
// *this = U2 * diag(W) * V^T (as stated in matrix.h).
//
// The computation runs on a local copy U of *this, so the matrix itself is
// not written to (unless one of the output arguments aliases it).
//
// Outputs (all three are overwritten):
//   U2 - (m x m) matrix; its leading columns are copied from the working
//        matrix U at the end of the routine.
//   W  - (min(m,n) x 1) matrix built from the array w of singular values.
//   V  - (n x n) matrix. The matrix V itself is output, not its transpose.
//
// Before returning, the singular values and the matching columns of U and V
// are sorted by decreasing singular value, and column signs are flipped so
// that the majority of entries in each U/V column pair is non-negative.
//
// If the iteration for one singular value has not converged after 30 passes,
// an error message is written to cerr and the computation simply carries on.
//
// NOTE(review): the structure and comments suggest this follows the
// Numerical Recipes "svdcmp" routine converted to 0-based indexing — confirm.
void Matrix::svd(Matrix &U2,Matrix &W,Matrix &V) {

  Matrix U = Matrix(*this);  // working copy; becomes the left singular vectors
  U2 = Matrix(m,m);
  V  = Matrix(n,n);

  // w collects the singular values, rv1 is scratch storage (n entries each)
  FLOAT* w   = (FLOAT*)malloc(n*sizeof(FLOAT));
  FLOAT* rv1 = (FLOAT*)malloc(n*sizeof(FLOAT));

  int32_t flag,i,its,j,jj,k,l,nm;
  FLOAT   anorm,c,f,g,h,s,scale,x,y,z;

  g = scale = anorm = 0.0; // Householder reduction to bidiagonal form.
  for (i=0;i<n;i++) {
    l = i+1;
    rv1[i] = scale*g;
    g = s = scale = 0.0;
    if (i < m) {
      for (k=i;k<m;k++) scale += fabs(U.val[k][i]);
      if (scale) {
        for (k=i;k<m;k++) {
          U.val[k][i] /= scale;
          s += U.val[k][i]*U.val[k][i];
        }
        f = U.val[i][i];
        g = -SIGN(sqrt(s),f);
        h = f*g-s;
        U.val[i][i] = f-g;
        for (j=l;j<n;j++) {
          for (s=0.0,k=i;k<m;k++) s += U.val[k][i]*U.val[k][j];
          f = s/h;
          for (k=i;k<m;k++) U.val[k][j] += f*U.val[k][i];
        }
        for (k=i;k<m;k++) U.val[k][i] *= scale;
      }
    }
    w[i] = scale*g;
    g = s = scale = 0.0;
    if (i<m && i!=n-1) {
      for (k=l;k<n;k++) scale += fabs(U.val[i][k]);
      if (scale) {
        for (k=l;k<n;k++) {
          U.val[i][k] /= scale;
          s += U.val[i][k]*U.val[i][k];
        }
        f = U.val[i][l];
        g = -SIGN(sqrt(s),f);
        h = f*g-s;
        U.val[i][l] = f-g;
        for (k=l;k<n;k++) rv1[k] = U.val[i][k]/h;
        for (j=l;j<m;j++) {
          for (s=0.0,k=l;k<n;k++) s += U.val[j][k]*U.val[i][k];
          for (k=l;k<n;k++) U.val[j][k] += s*rv1[k];
        }
        for (k=l;k<n;k++) U.val[i][k] *= scale;
      }
    }
    // running maximum of |w[i]|+|rv1[i]|, used below as the magnitude
    // reference for the "negligible" tests
    anorm = FMAX(anorm,(fabs(w[i])+fabs(rv1[i])));
  }
  for (i=n-1;i>=0;i--) { // Accumulation of right-hand transformations.
    if (i<n-1) {
      if (g) {
        for (j=l;j<n;j++) // Double division to avoid possible underflow.
          V.val[j][i]=(U.val[i][j]/U.val[i][l])/g;
        for (j=l;j<n;j++) {
          for (s=0.0,k=l;k<n;k++) s += U.val[i][k]*V.val[k][j];
          for (k=l;k<n;k++) V.val[k][j] += s*V.val[k][i];
        }
      }
      for (j=l;j<n;j++) V.val[i][j] = V.val[j][i] = 0.0;
    }
    V.val[i][i] = 1.0;
    g = rv1[i];
    l = i;
  }
  for (i=IMIN(m,n)-1;i>=0;i--) { // Accumulation of left-hand transformations.
    l = i+1;
    g = w[i];
    for (j=l;j<n;j++) U.val[i][j] = 0.0;
    if (g) {
      g = 1.0/g;
      for (j=l;j<n;j++) {
        for (s=0.0,k=l;k<m;k++) s += U.val[k][i]*U.val[k][j];
        f = (s/U.val[i][i])*g;
        for (k=i;k<m;k++) U.val[k][j] += f*U.val[k][i];
      }
      for (j=i;j<m;j++) U.val[j][i] *= g;
    } else for (j=i;j<m;j++) U.val[j][i]=0.0;
    ++U.val[i][i];
  }
  for (k=n-1;k>=0;k--) { // Diagonalization of the bidiagonal form: Loop over singular values,
    for (its=0;its<30;its++) { // and over allowed iterations.
      flag = 1;
      for (l=k;l>=0;l--) { // Test for splitting.
        nm = l-1;
        // a value counts as negligible when adding it to anorm does not change anorm
        if ((FLOAT)(fabs(rv1[l])+anorm) == anorm) { flag = 0; break; }
        if ((FLOAT)(fabs( w[nm])+anorm) == anorm) { break; }
      }
      if (flag) {
        c = 0.0; // Cancellation of rv1[l], if l > 1.
        s = 1.0;
        for (i=l;i<=k;i++) {
          f = s*rv1[i];
          rv1[i] = c*rv1[i];
          if ((FLOAT)(fabs(f)+anorm) == anorm) break;
          g = w[i];
          h = pythag(f,g);
          w[i] = h;
          h = 1.0/h;
          c = g*h;
          s = -f*h;
          for (j=0;j<m;j++) {
            y = U.val[j][nm];
            z = U.val[j][i];
            U.val[j][nm] = y*c+z*s;
            U.val[j][i]  = z*c-y*s;
          }
        }
      }
      z = w[k];
      if (l==k) { // Convergence.
        if (z<0.0) { // Singular value is made nonnegative.
          w[k] = -z;
          for (j=0;j<n;j++) V.val[j][k] = -V.val[j][k];
        }
        break;
      }
      // last allowed pass: report the problem, but do not abort
      if (its == 29)
        cerr << "ERROR in SVD: No convergence in 30 iterations" << endl;
      x = w[l]; // Shift from bottom 2-by-2 minor.
      nm = k-1;
      y = w[nm];
      g = rv1[nm];
      h = rv1[k];
      f = ((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
      g = pythag(f,1.0);
      f = ((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
      c = s = 1.0; // Next QR transformation:
      for (j=l;j<=nm;j++) {
        i = j+1;
        g = rv1[i];
        y = w[i];
        h = s*g;
        g = c*g;
        z = pythag(f,h);
        rv1[j] = z;
        c = f/z;
        s = h/z;
        f = x*c+g*s;
        g = g*c-x*s;
        h = y*s;
        y *= c;
        for (jj=0;jj<n;jj++) {
          x = V.val[jj][j];
          z = V.val[jj][i];
          V.val[jj][j] = x*c+z*s;
          V.val[jj][i] = z*c-x*s;
        }
        z = pythag(f,h);
        w[j] = z; // Rotation can be arbitrary if z = 0.
        if (z) {
          z = 1.0/z;
          c = f*z;
          s = h*z;
        }
        f = c*g+s*y;
        x = c*y-s*g;
        for (jj=0;jj<m;jj++) {
          y = U.val[jj][j];
          z = U.val[jj][i];
          U.val[jj][j] = y*c+z*s;
          U.val[jj][i] = z*c-y*s;
        }
      }
      rv1[l] = 0.0;
      rv1[k] = f;
      w[k] = x;
    }
  }

  // sort singular values and corresponding columns of u and v
  // by decreasing magnitude. Also, signs of corresponding columns are
  // flipped so as to maximize the number of positive elements.
  int32_t s2,inc=1;
  FLOAT   sw;
  FLOAT* su = (FLOAT*)malloc(m*sizeof(FLOAT)); // holds one column of U while it is moved
  FLOAT* sv = (FLOAT*)malloc(n*sizeof(FLOAT)); // holds one column of V while it is moved
  do { inc *= 3; inc++; } while (inc <= n);    // initial gap: first value of 1,4,13,... above n
  do {
    inc /= 3;
    // insertion sort over elements that are 'inc' apart
    for (i=inc;i<n;i++) {
      sw = w[i];
      for (k=0;k<m;k++) su[k] = U.val[k][i];
      for (k=0;k<n;k++) sv[k] = V.val[k][i];
      j = i;
      while (w[j-inc] < sw) {
        w[j] = w[j-inc];
        for (k=0;k<m;k++) U.val[k][j] = U.val[k][j-inc];
        for (k=0;k<n;k++) V.val[k][j] = V.val[k][j-inc];
        j -= inc;
        if (j < inc) break;
      }
      w[j] = sw;
      for (k=0;k<m;k++) U.val[k][j] = su[k];
      for (k=0;k<n;k++) V.val[k][j] = sv[k];
    }
  } while (inc > 1);
  for (k=0;k<n;k++) { // flip signs
    s2=0; // number of negative entries in column k of U and V together
    for (i=0;i<m;i++) if (U.val[i][k] < 0.0) s2++;
    for (j=0;j<n;j++) if (V.val[j][k] < 0.0) s2++;
    if (s2 > (m+n)/2) {
      for (i=0;i<m;i++) U.val[i][k] = -U.val[i][k];
      for (j=0;j<n;j++) V.val[j][k] = -V.val[j][k];
    }
  }

  // create vector and copy singular values
  W = Matrix(min(m,n),1,w);

  // copy the leading min(m,n) columns of U into the top-left corner of the mxm matrix U2
  U2.setMat(U.getMat(0,0,m-1,min(m-1,n-1)),0,0);

  // release temporary memory
  free(w);
  free(rv1);
  free(su);
  free(sv);
}
805 | 
806 | ostream& operator<< (ostream& out,const Matrix& M) {
807 |   if (M.m==0 || M.n==0) {
808 |     out << "[empty matrix]";
809 |   } else {
810 |     char buffer[1024];
811 |     for (int32_t i=0; i<M.m; i++) {
812 |       for (int32_t j=0; j<M.n; j++) {
813 |         sprintf(buffer,"%12.7f ",M.val[i][j]);
814 |         out << buffer;
815 |       }
816 |       if (i<M.m-1)
817 |         out << endl;
818 |     }
819 |   }
820 |   return out;
821 | }
822 | 
823 | void Matrix::allocateMemory (const int32_t m_,const int32_t n_) {
824 |   m = abs(m_); n = abs(n_);
825 |   if (m==0 || n==0) {
826 |     val = 0;
827 |     return;
828 |   }
829 |   val    = (FLOAT**)malloc(m*sizeof(FLOAT*));
830 |   val[0] = (FLOAT*)calloc(m*n,sizeof(FLOAT));
831 |   for(int32_t i=1; i<m; i++)
832 |     val[i] = val[i-1]+n;
833 | }
834 | 
835 | void Matrix::releaseMemory () {
836 |   if (val!=0) {
837 |     free(val[0]);
838 |     free(val);
839 |   }
840 | }
841 | 
842 | FLOAT Matrix::pythag(FLOAT a,FLOAT b) {
843 |   FLOAT absa,absb;
844 |   absa = fabs(a);
845 |   absb = fabs(b);
846 |   if (absa > absb)
847 |     return absa*sqrt(1.0+SQR(absb/absa));
848 |   else
849 |     return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb)));
850 | }
851 | 
852 | 


--------------------------------------------------------------------------------
/CPU/matrix.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libviso2.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libviso2 is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 2 of the License, or any later version.
 12 | 
 13 | libviso2 is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libviso2; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #ifndef MATRIX_H
 23 | #define MATRIX_H
 24 | 
 25 | #include <stdio.h>
 26 | #include <string.h>
 27 | #include <stdlib.h>
 28 | #include <iostream>
 29 | #include <vector>
 30 | 
// Fixed-width integer types: taken from <stdint.h>, except when _MSC_VER is
// defined, in which case they are declared from the compiler's __intN types.
#ifndef _MSC_VER
  #include <stdint.h>
#else
  typedef __int8            int8_t;
  typedef __int16           int16_t;
  typedef __int32           int32_t;
  typedef __int64           int64_t;
  typedef unsigned __int8   uint8_t;
  typedef unsigned __int16  uint16_t;
  typedef unsigned __int32  uint32_t;
  typedef unsigned __int64  uint64_t;
#endif

#define endll endl << endl // stream shorthand: end the line, then add one blank line

// Scalar type of all matrix elements; exactly one of the two typedefs below
// must be active.
typedef double FLOAT;      // double precision
//typedef float  FLOAT;    // single precision
 48 | 
// Dense (m x n) matrix of FLOAT values.
// Elements are reached through the public row-pointer array val, i.e.
// val[i][j] with 0 <= i < m and 0 <= j < n; val is 0 when m or n is zero.
// Most of the member functions implemented in matrix.cpp report invalid
// arguments by printing a message and terminating the program via exit().
class Matrix {

public:

  // constructors / destructor
  Matrix ();                                                  // init empty 0x0 matrix
  Matrix (const int32_t m,const int32_t n);                   // init empty mxn matrix
  Matrix (const int32_t m,const int32_t n,const FLOAT* val_); // init mxn matrix with values from array 'val'
  Matrix (const Matrix &M);                                   // creates deepcopy of M
  ~Matrix ();

  // assignment operator, copies contents of M
  Matrix& operator= (const Matrix &M);

  // copies submatrix of M into array 'val', default values copy whole row/column/matrix
  void getData(FLOAT* val_,int32_t i1=0,int32_t j1=0,int32_t i2=-1,int32_t j2=-1);

  // set or get submatrices of current matrix
  // (setMat copies all of M so that its top-left element lands at row i, column j)
  Matrix getMat(int32_t i1,int32_t j1,int32_t i2=-1,int32_t j2=-1);
  void   setMat(const Matrix &M,const int32_t i,const int32_t j);

  // set sub-matrix to scalar (default 0), -1 as end replaces whole row/column/matrix
  void setVal(FLOAT s,int32_t i1=0,int32_t j1=0,int32_t i2=-1,int32_t j2=-1);

  // set (part of) diagonal to scalar, -1 as end replaces whole diagonal
  void setDiag(FLOAT s,int32_t i1=0,int32_t i2=-1);

  // clear matrix (set all elements to zero)
  void zero();
  
  // extract columns with given index; indices >= n are skipped
  Matrix extractCols (std::vector<int> idx);

  // create identity matrix (static: new mxm matrix; member: overwrite *this in place)
  static Matrix eye (const int32_t m);
  void          eye ();

  // create diagonal matrix with nx1 or 1xn matrix M as elements
  static Matrix diag(const Matrix &M);
  
  // returns the m-by-n matrix whose elements are taken row-wise from M
  // (the total number of elements must stay the same)
  static Matrix reshape(const Matrix &M,int32_t m,int32_t n);

  // create 3x3 rotation matrices (convention: http://en.wikipedia.org/wiki/Rotation_matrix)
  static Matrix rotMatX(const FLOAT &angle);
  static Matrix rotMatY(const FLOAT &angle);
  static Matrix rotMatZ(const FLOAT &angle);

  // simple arithmetic operations
  Matrix  operator+ (const Matrix &M); // add matrix
  Matrix  operator- (const Matrix &M); // subtract matrix
  Matrix  operator* (const Matrix &M); // multiply with matrix
  Matrix  operator* (const FLOAT &s);  // multiply with scalar
  Matrix  operator/ (const Matrix &M); // divide elementwise by matrix (or vector); zero divisors are skipped
  Matrix  operator/ (const FLOAT &s);  // divide by scalar
  Matrix  operator- ();                // negative matrix
  Matrix  operator~ ();                // transpose
  FLOAT   l2norm ();                   // euclidean norm (vectors) / frobenius norm (matrices)
  FLOAT   mean ();                     // mean of all elements in matrix

  // complex arithmetic operations
  static Matrix cross (const Matrix &a, const Matrix &b);    // cross product of two (3x1) vectors
  static Matrix inv (const Matrix &M);                       // invert matrix M
  bool   inv ();                                             // invert this matrix
  FLOAT  det ();                                             // returns determinant of matrix
  bool   solve (const Matrix &M,FLOAT eps=1e-20);            // solve linear system M*x=B, replaces *this and M
  bool   lu(int32_t *idx, FLOAT &d, FLOAT eps=1e-20);        // replace *this by lower upper decomposition
  void   svd(Matrix &U,Matrix &W,Matrix &V);                 // singular value decomposition *this = U*diag(W)*V^T

  // print matrix to stream
  friend std::ostream& operator<< (std::ostream& out,const Matrix& M);

  // direct data access
  FLOAT   **val;  // row pointers: val[i][j] is the element in row i, column j
  int32_t   m,n;  // number of rows (m) and columns (n)

private:

  void allocateMemory (const int32_t m_,const int32_t n_); // set size and allocate zeroed storage
  void releaseMemory ();                                   // free the storage, if any
  inline FLOAT pythag(FLOAT a,FLOAT b);                    // sqrt(a^2+b^2), helper of svd()

};
132 | 
133 | #endif // MATRIX_H
134 | 


--------------------------------------------------------------------------------
/CPU/timer.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | #ifndef __TIMER_H__
 23 | #define __TIMER_H__
 24 | 
 25 | #include <iostream>
 26 | #include <iomanip>
 27 | #include <stdio.h>
 28 | #include <string.h>
 29 | #include <stdlib.h>
 30 | #include <vector>
 31 | #include <string>
 32 | #include <sys/time.h>
 33 | 
// Fixed-width integer types: taken from <stdint.h>, except for Visual Studio
// projects (_MSC_VER defined), where they are declared from the compiler's
// __intN types.
#ifndef _MSC_VER
  #include <stdint.h>
#else
  typedef __int8            int8_t;
  typedef __int16           int16_t;
  typedef __int32           int32_t;
  typedef __int64           int64_t;
  typedef unsigned __int8   uint8_t;
  typedef unsigned __int16  uint16_t;
  typedef unsigned __int32  uint32_t;
  typedef unsigned __int64  uint64_t;
#endif
 47 | 
 48 | class Timer {
 49 |   
 50 | public:
 51 |   
 52 |   Timer() {}
 53 |   
 54 |   ~Timer() {}
 55 |   
 56 |   void start (std::string title) {
 57 |     desc.push_back(title);
 58 |     push_back_time();
 59 |   }
 60 |   
 61 |   void stop () {
 62 |     if (time.size()<=desc.size())
 63 |       push_back_time();
 64 |   }
 65 |   
 66 |   void plot () {
 67 |     stop();
 68 |     float total_time = 0;
 69 |     for (int32_t i=0; i<desc.size(); i++) {
 70 |       float curr_time = getTimeDifferenceMilliseconds(time[i],time[i+1]);
 71 |       total_time += curr_time;
 72 |       std::cout.width(30);
 73 |       std::cout << desc[i] << " ";
 74 |       std::cout << std::fixed << std::setprecision(1) << std::setw(6);
 75 |       std::cout << curr_time;
 76 |       std::cout << " ms" << std::endl;
 77 |     }
 78 |     std::cout << "========================================" << std::endl;
 79 |     std::cout << "                    Total time ";
 80 |     std::cout << std::fixed << std::setprecision(1) << std::setw(6);
 81 |     std::cout << total_time;
 82 |     std::cout << " ms" << std::endl << std::endl;
 83 |   }
 84 |   
 85 |   void reset () {
 86 |     desc.clear();
 87 |     time.clear();
 88 |   }
 89 |   
 90 | private:
 91 |   
 92 |   std::vector<std::string>  desc;
 93 |   std::vector<timeval>      time;
 94 |   
 95 |   void push_back_time () {
 96 |     timeval curr_time;
 97 |     gettimeofday(&curr_time,0);
 98 |     time.push_back(curr_time);
 99 |   }
100 |   
101 |   float getTimeDifferenceMilliseconds(timeval a,timeval b) {
102 |     return ((float)(b.tv_sec -a.tv_sec ))*1e+3 +
103 |            ((float)(b.tv_usec-a.tv_usec))*1e-3;
104 |   }
105 | };
106 | 
107 | #endif
108 | 


--------------------------------------------------------------------------------
/CPU/triangle.h:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************/
  2 | /*                                                                           */
  3 | /*  (triangle.h)                                                             */
  4 | /*                                                                           */
  5 | /*  Include file for programs that call Triangle.                            */
  6 | /*                                                                           */
  7 | /*  Accompanies Triangle Version 1.6                                         */
  8 | /*  July 28, 2005                                                            */
  9 | /*                                                                           */
 10 | /*  Copyright 1996, 2005                                                     */
 11 | /*  Jonathan Richard Shewchuk                                                */
 12 | /*  2360 Woolsey #H                                                          */
 13 | /*  Berkeley, California  94705-1927                                         */
 14 | /*  jrs@cs.berkeley.edu                                                      */
 15 | /*                                                                           */
 16 | /*  Modified by Andreas Geiger, 2011                                         */
 17 | /*****************************************************************************/
 18 | 
 19 | /*****************************************************************************/
 20 | /*                                                                           */
 21 | /*  How to call Triangle from another program                                */
 22 | /*                                                                           */
 23 | /*                                                                           */
 24 | /*  If you haven't read Triangle's instructions (run "triangle -h" to read   */
 25 | /*  them), you won't understand what follows.                                */
 26 | /*                                                                           */
 27 | /*  Triangle must be compiled into an object file (triangle.o) with the      */
 28 | /*  TRILIBRARY symbol defined (generally by using the -DTRILIBRARY compiler  */
 29 | /*  switch).  The makefile included with Triangle will do this for you if    */
 30 | /*  you run "make trilibrary".  The resulting object file can be called via  */
 31 | /*  the procedure triangulate().                                             */
 32 | /*                                                                           */
 33 | /*  If the size of the object file is important to you, you may wish to      */
 34 | /*  generate a reduced version of triangle.o.  The REDUCED symbol gets rid   */
 35 | /*  of all features that are primarily of research interest.  Specifically,  */
 36 | /*  the -DREDUCED switch eliminates Triangle's -i, -F, -s, and -C switches.  */
 37 | /*  The CDT_ONLY symbol gets rid of all meshing algorithms above and beyond  */
 38 | /*  constrained Delaunay triangulation.  Specifically, the -DCDT_ONLY switch */
 39 | /*  eliminates Triangle's -r, -q, -a, -u, -D, -Y, -S, and -s switches.       */
 40 | /*                                                                           */
 41 | /*  IMPORTANT:  These definitions (TRILIBRARY, REDUCED, CDT_ONLY) must be    */
 42 | /*  made in the makefile or in triangle.c itself.  Putting these definitions */
 43 | /*  in this file (triangle.h) will not create the desired effect.            */
 44 | /*                                                                           */
 45 | /*                                                                           */
 46 | /*  The calling convention for triangulate() follows.                        */
 47 | /*                                                                           */
 48 | /*      void triangulate(triswitches, in, out, vorout)                       */
 49 | /*      char *triswitches;                                                   */
 50 | /*      struct triangulateio *in;                                            */
 51 | /*      struct triangulateio *out;                                           */
 52 | /*      struct triangulateio *vorout;                                        */
 53 | /*                                                                           */
 54 | /*  `triswitches' is a string containing the command line switches you wish  */
 55 | /*  to invoke.  No initial dash is required.  Some suggestions:              */
 56 | /*                                                                           */
 57 | /*  - You'll probably find it convenient to use the `z' switch so that       */
 58 | /*    points (and other items) are numbered from zero.  This simplifies      */
 59 | /*    indexing, because the first item of any type always starts at index    */
 60 | /*    [0] of the corresponding array, whether that item's number is zero or  */
 61 | /*    one.                                                                   */
 62 | /*  - You'll probably want to use the `Q' (quiet) switch in your final code, */
 63 | /*    but you can take advantage of Triangle's printed output (including the */
 64 | /*    `V' switch) while debugging.                                           */
 65 | /*  - If you are not using the `q', `a', `u', `D', `j', or `s' switches,     */
 66 | /*    then the output points will be identical to the input points, except   */
 67 | /*    possibly for the boundary markers.  If you don't need the boundary     */
 68 | /*    markers, you should use the `N' (no nodes output) switch to save       */
 69 | /*    memory.  (If you do need boundary markers, but need to save memory, a  */
 70 | /*    good nasty trick is to set out->pointlist equal to in->pointlist       */
 71 | /*    before calling triangulate(), so that Triangle overwrites the input    */
 72 | /*    points with identical copies.)                                         */
 73 | /*  - The `I' (no iteration numbers) and `g' (.off file output) switches     */
 74 | /*    have no effect when Triangle is compiled with TRILIBRARY defined.      */
 75 | /*                                                                           */
 76 | /*  `in', `out', and `vorout' are descriptions of the input, the output,     */
 77 | /*  and the Voronoi output.  If the `v' (Voronoi output) switch is not used, */
 78 | /*  `vorout' may be NULL.  `in' and `out' may never be NULL.                 */
 79 | /*                                                                           */
 80 | /*  Certain fields of the input and output structures must be initialized,   */
 81 | /*  as described below.                                                      */
 82 | /*                                                                           */
 83 | /*****************************************************************************/
 84 | 
 85 | /*****************************************************************************/
 86 | /*                                                                           */
 87 | /*  The `triangulateio' structure.                                           */
 88 | /*                                                                           */
 89 | /*  Used to pass data into and out of the triangulate() procedure.           */
 90 | /*                                                                           */
 91 | /*                                                                           */
 92 | /*  Arrays are used to store points, triangles, markers, and so forth.  In   */
 93 | /*  all cases, the first item in any array is stored starting at index [0].  */
 94 | /*  However, that item is item number `1' unless the `z' switch is used, in  */
 95 | /*  which case it is item number `0'.  Hence, you may find it easier to      */
 96 | /*  index points (and triangles in the neighbor list) if you use the `z'     */
 97 | /*  switch.  Unless, of course, you're calling Triangle from a Fortran       */
 98 | /*  program.                                                                 */
 99 | /*                                                                           */
100 | /*  Description of fields (except the `numberof' fields, which are obvious): */
101 | /*                                                                           */
102 | /*  `pointlist':  An array of point coordinates.  The first point's x        */
103 | /*    coordinate is at index [0] and its y coordinate at index [1], followed */
104 | /*    by the coordinates of the remaining points.  Each point occupies two   */
105 | /*    REALs.                                                                 */
106 | /*  `pointattributelist':  An array of point attributes.  Each point's       */
107 | /*    attributes occupy `numberofpointattributes' REALs.                     */
108 | /*  `pointmarkerlist':  An array of point markers; one int per point.        */
109 | /*                                                                           */
110 | /*  `trianglelist':  An array of triangle corners.  The first triangle's     */
111 | /*    first corner is at index [0], followed by its other two corners in     */
112 | /*    counterclockwise order, followed by any other nodes if the triangle    */
113 | /*    represents a nonlinear element.  Each triangle occupies                */
114 | /*    `numberofcorners' ints.                                                */
115 | /*  `triangleattributelist':  An array of triangle attributes.  Each         */
116 | /*    triangle's attributes occupy `numberoftriangleattributes' REALs.       */
117 | /*  `trianglearealist':  An array of triangle area constraints; one REAL per */
118 | /*    triangle.  Input only.                                                 */
119 | /*  `neighborlist':  An array of triangle neighbors; three ints per          */
120 | /*    triangle.  Output only.                                                */
121 | /*                                                                           */
122 | /*  `segmentlist':  An array of segment endpoints.  The first segment's      */
123 | /*    endpoints are at indices [0] and [1], followed by the remaining        */
124 | /*    segments.  Two ints per segment.                                       */
125 | /*  `segmentmarkerlist':  An array of segment markers; one int per segment.  */
126 | /*                                                                           */
127 | /*  `holelist':  An array of holes.  The first hole's x and y coordinates    */
128 | /*    are at indices [0] and [1], followed by the remaining holes.  Two      */
129 | /*    REALs per hole.  Input only, although the pointer is copied to the     */
130 | /*    output structure for your convenience.                                 */
131 | /*                                                                           */
132 | /*  `regionlist':  An array of regional attributes and area constraints.     */
133 | /*    The first constraint's x and y coordinates are at indices [0] and [1], */
134 | /*    followed by the regional attribute at index [2], followed by the       */
135 | /*    maximum area at index [3], followed by the remaining area constraints. */
136 | /*    Four REALs per area constraint.  Note that each regional attribute is  */
137 | /*    used only if you select the `A' switch, and each area constraint is    */
138 | /*    used only if you select the `a' switch (with no number following), but */
139 | /*    omitting one of these switches does not change the memory layout.      */
140 | /*    Input only, although the pointer is copied to the output structure for */
141 | /*    your convenience.                                                      */
142 | /*                                                                           */
143 | /*  `edgelist':  An array of edge endpoints.  The first edge's endpoints are */
144 | /*    at indices [0] and [1], followed by the remaining edges.  Two ints per */
145 | /*    edge.  Output only.                                                    */
146 | /*  `edgemarkerlist':  An array of edge markers; one int per edge.  Output   */
147 | /*    only.                                                                  */
148 | /*  `normlist':  An array of normal vectors, used for infinite rays in       */
149 | /*    Voronoi diagrams.  The first normal vector's x and y magnitudes are    */
150 | /*    at indices [0] and [1], followed by the remaining vectors.  For each   */
151 | /*    finite edge in a Voronoi diagram, the normal vector written is the     */
152 | /*    zero vector.  Two REALs per edge.  Output only.                        */
153 | /*                                                                           */
154 | /*                                                                           */
155 | /*  Any input fields that Triangle will examine must be initialized.         */
156 | /*  Furthermore, for each output array that Triangle will write to, you      */
157 | /*  must either provide space by setting the appropriate pointer to point    */
158 | /*  to the space you want the data written to, or you must initialize the    */
159 | /*  pointer to NULL, which tells Triangle to allocate space for the results. */
160 | /*  The latter option is preferable, because Triangle always knows exactly   */
161 | /*  how much space to allocate.  The former option is provided mainly for    */
162 | /*  people who need to call Triangle from Fortran code, though it also makes */
163 | /*  possible some nasty space-saving tricks, like writing the output to the  */
164 | /*  same arrays as the input.                                                */
165 | /*                                                                           */
166 | /*  Triangle will not free() any input or output arrays, including those it  */
167 | /*  allocates itself; that's up to you.  You should free arrays allocated by */
168 | /*  Triangle by calling the trifree() procedure defined below.  (By default, */
169 | /*  trifree() just calls the standard free() library procedure, but          */
170 | /*  applications that call triangulate() may replace trimalloc() and         */
171 | /*  trifree() in triangle.c to use specialized memory allocators.)           */
172 | /*                                                                           */
173 | /*  Here's a guide to help you decide which fields you must initialize       */
174 | /*  before you call triangulate().                                           */
175 | /*                                                                           */
176 | /*  `in':                                                                    */
177 | /*                                                                           */
178 | /*    - `pointlist' must always point to a list of points; `numberofpoints'  */
179 | /*      and `numberofpointattributes' must be properly set.                  */
180 | /*      `pointmarkerlist' must either be set to NULL (in which case all      */
181 | /*      markers default to zero), or must point to a list of markers.  If    */
182 | /*      `numberofpointattributes' is not zero, `pointattributelist' must     */
183 | /*      point to a list of point attributes.                                 */
184 | /*    - If the `r' switch is used, `trianglelist' must point to a list of    */
185 | /*      triangles, and `numberoftriangles', `numberofcorners', and           */
186 | /*      `numberoftriangleattributes' must be properly set.  If               */
187 | /*      `numberoftriangleattributes' is not zero, `triangleattributelist'    */
188 | /*      must point to a list of triangle attributes.  If the `a' switch is   */
189 | /*      used (with no number following), `trianglearealist' must point to a  */
190 | /*      list of triangle area constraints.  `neighborlist' may be ignored.   */
191 | /*    - If the `p' switch is used, `segmentlist' must point to a list of     */
192 | /*      segments, `numberofsegments' must be properly set, and               */
193 | /*      `segmentmarkerlist' must either be set to NULL (in which case all    */
194 | /*      markers default to zero), or must point to a list of markers.        */
195 | /*    - If the `p' switch is used without the `r' switch, then               */
196 | /*      `numberofholes' and `numberofregions' must be properly set.  If      */
197 | /*      `numberofholes' is not zero, `holelist' must point to a list of      */
198 | /*      holes.  If `numberofregions' is not zero, `regionlist' must point to */
199 | /*      a list of region constraints.                                        */
200 | /*    - If the `p' switch is used, `holelist', `numberofholes',              */
/*      `regionlist', and `numberofregions' are copied to `out'.  (You can   */
202 | /*      nonetheless get away with not initializing them if the `r' switch is */
203 | /*      used.)                                                               */
204 | /*    - `edgelist', `edgemarkerlist', `normlist', and `numberofedges' may be */
205 | /*      ignored.                                                             */
206 | /*                                                                           */
207 | /*  `out':                                                                   */
208 | /*                                                                           */
209 | /*    - `pointlist' must be initialized (NULL or pointing to memory) unless  */
210 | /*      the `N' switch is used.  `pointmarkerlist' must be initialized       */
211 | /*      unless the `N' or `B' switch is used.  If `N' is not used and        */
212 | /*      `in->numberofpointattributes' is not zero, `pointattributelist' must */
213 | /*      be initialized.                                                      */
214 | /*    - `trianglelist' must be initialized unless the `E' switch is used.    */
215 | /*      `neighborlist' must be initialized if the `n' switch is used.  If    */
/*      the `E' switch is not used and (`in->numberoftriangleattributes' is  */
/*      not zero or the `A' switch is used), `triangleattributelist' must be */
218 | /*      initialized.  `trianglearealist' may be ignored.                     */
219 | /*    - `segmentlist' must be initialized if the `p' or `c' switch is used,  */
220 | /*      and the `P' switch is not used.  `segmentmarkerlist' must also be    */
221 | /*      initialized under these circumstances unless the `B' switch is used. */
222 | /*    - `edgelist' must be initialized if the `e' switch is used.            */
223 | /*      `edgemarkerlist' must be initialized if the `e' switch is used and   */
224 | /*      the `B' switch is not.                                               */
225 | /*    - `holelist', `regionlist', `normlist', and all scalars may be ignored.*/
226 | /*                                                                           */
227 | /*  `vorout' (only needed if `v' switch is used):                            */
228 | /*                                                                           */
229 | /*    - `pointlist' must be initialized.  If `in->numberofpointattributes'   */
230 | /*      is not zero, `pointattributelist' must be initialized.               */
231 | /*      `pointmarkerlist' may be ignored.                                    */
232 | /*    - `edgelist' and `normlist' must both be initialized.                  */
233 | /*      `edgemarkerlist' may be ignored.                                     */
234 | /*    - Everything else may be ignored.                                      */
235 | /*                                                                           */
236 | /*  After a call to triangulate(), the valid fields of `out' and `vorout'    */
237 | /*  will depend, in an obvious way, on the choice of switches used.  Note    */
238 | /*  that when the `p' switch is used, the pointers `holelist' and            */
239 | /*  `regionlist' are copied from `in' to `out', but no new space is          */
240 | /*  allocated; be careful that you don't free() the same array twice.  On    */
241 | /*  the other hand, Triangle will never copy the `pointlist' pointer (or any */
242 | /*  others); new space is allocated for `out->pointlist', or if the `N'      */
243 | /*  switch is used, `out->pointlist' remains uninitialized.                  */
244 | /*                                                                           */
245 | /*  All of the meaningful `numberof' fields will be properly set; for        */
246 | /*  instance, `numberofedges' will represent the number of edges in the      */
247 | /*  triangulation whether or not the edges were written.  If segments are    */
248 | /*  not used, `numberofsegments' will indicate the number of boundary edges. */
249 | /*                                                                           */
250 | /*****************************************************************************/
251 | 
/* Include guard: without it a second inclusion of this header in the same   */
/* translation unit would redefine `struct triangulateio'.                   */
#ifndef LIBELAS_TRIANGLE_H
#define LIBELAS_TRIANGLE_H

/* Data exchanged with triangulate(); see the field descriptions above.      */
/* Coordinate and attribute arrays are stored as float in this version.      */
struct triangulateio {
  /* points */
  float *pointlist;               /* x,y pairs, 2 floats per point; in / out */
  float *pointattributelist;      /* numberofpointattributes per point; in / out */
  int *pointmarkerlist;           /* one marker per point; in / out */
  int numberofpoints;             /* in / out */
  int numberofpointattributes;    /* in / out */

  /* triangles */
  int *trianglelist;              /* numberofcorners ints per triangle; in / out */
  float *triangleattributelist;   /* numberoftriangleattributes per triangle; in / out */
  float *trianglearealist;        /* one area constraint per triangle; in only */
  int *neighborlist;              /* three ints per triangle; out only */
  int numberoftriangles;          /* in / out */
  int numberofcorners;            /* in / out */
  int numberoftriangleattributes; /* in / out */

  /* segments */
  int *segmentlist;               /* two endpoint ints per segment; in / out */
  int *segmentmarkerlist;         /* one marker per segment; in / out */
  int numberofsegments;           /* in / out */

  /* holes */
  float *holelist;                /* x,y per hole; in / pointer to array copied out */
  int numberofholes;              /* in / copied out */

  /* regions */
  float *regionlist;              /* x, y, attribute, max area per region; */
                                  /* in / pointer to array copied out */
  int numberofregions;            /* in / copied out */

  /* edges */
  int *edgelist;                  /* two endpoint ints per edge; out only */
  int *edgemarkerlist;            /* not used with Voronoi diagram; out only */
  float *normlist;                /* used only with Voronoi diagram; out only */
  int numberofedges;              /* out only */
};

/* triangulate(triswitches, in, out, vorout): see the calling convention     */
/* documented above.  `vorout' may be NULL unless the `v' switch is used.    */
void triangulate(char *,triangulateio *,triangulateio *,triangulateio *);

/* Releases an array that Triangle allocated for an output structure.        */
void trifree(int *memptr);

#endif /* LIBELAS_TRIANGLE_H */
285 | 
286 | 


--------------------------------------------------------------------------------
/GPU/RUN:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -p cpeg655     
 4 | #SBATCH -N 1
 5 | #SBATCH --gres=gpu:1
 6 | 
 7 | #SBATCH -J ngeneva    
 8 | #SBATCH -e my_job_%j.err
 9 | #SBATCH -o my_job_%j.out
10 | 
11 | srun main


--------------------------------------------------------------------------------
/GPU/elas_gpu.cu:
--------------------------------------------------------------------------------
  1 | #include "elas_gpu.h"
  2 | 
  3 | using namespace std;
  4 | 
  5 | __device__ uint32_t getAddressOffsetImage_GPU (const int32_t& u,const int32_t& v,const int32_t& width) {
  6 |   return v*width+u;
  7 | }
  8 | 
  9 | __device__ uint32_t getAddressOffsetGrid_GPU (const int32_t& x,const int32_t& y,const int32_t& d,const int32_t& width,const int32_t& disp_num) {
 10 |   return (y*width+x)*disp_num+d;
 11 | }
 12 | 
/**
 * CUDA kernel: each thread computes the disparity of one (u,v) pixel.
 *
 * Thread idx = blockDim.x*blockIdx.x + threadIdx.x handles entry idx of the
 * per-pixel arrays (u_vals, v_vals, planes_a/b/c, valids); threads with
 * idx >= size_total return immediately.
 *
 * The matching cost of a candidate disparity d is the sum of absolute
 * differences (SAD) between the 16-byte descriptor at (u,v) of the reference
 * image and the 16-byte descriptor at the warped column of the other image.
 * Two candidate sets are evaluated:
 *   1. disparities stored in the pixel's disparity-grid cell that fall
 *      outside [d_plane_min, d_plane_max] (plain SAD), and
 *   2. every disparity in [d_plane_min, d_plane_max] (SAD plus
 *      P[|d - d_plane|] when valids[idx] is true).
 * The cheapest candidate is written to D; -1 is written if no candidate was
 * accepted.  Pixels rejected by the border or texture check leave D untouched.
 *
 * @param u_vals, v_vals    pixel coordinates, one pair per thread
 * @param size_total        number of entries in the per-pixel arrays
 * @param planes_a/b/c      plane prior per pixel: d_plane = a*u + b*v + c
 * @param disparity_grid    candidate lists; each cell occupies grid_dims[0]
 *                          ints: a count followed by the candidate disparities
 * @param grid_dims         [0] ints per grid cell, [1] grid cells per row
 * @param I1_desc, I2_desc  descriptors, 16 bytes per pixel, row-major
 * @param P                 additive cost table indexed by |d - d_plane|
 * @param plane_radius      half-width of the range searched around d_plane
 * @param width, height     image dimensions in pixels
 * @param valids            per-pixel flag that enables the P[] term
 * @param right_image       false: I1 is the reference, warp is u-d;
 *                          true:  I2 is the reference, warp is u+d
 * @param D                 output disparities, indexed by v*width+u
 */
__global__ void findMatch_GPU (int32_t* u_vals, int32_t* v_vals, int32_t size_total, float* planes_a, float* planes_b, float* planes_c,
                         int32_t* disparity_grid, int32_t *grid_dims, uint8_t* I1_desc, uint8_t* I2_desc,
                         int32_t* P, int32_t plane_radius, int32_t width ,int32_t height, bool* valids, bool right_image, float* D) {
 
  // number of disparities (a grid cell holds one count entry plus the disparities)
  const int32_t disp_num    = grid_dims[0]-1;
  // columns closer than this to the left/right image border are not matched
  const int32_t window_size = 2;
  
  //TODO: Remove hard code and use param
  bool subsampling = false;
  bool match_texture = true;
  int32_t grid_size = 20;

  // Pixel id
  uint32_t idx = blockDim.x*blockIdx.x + threadIdx.x;

  // Check that we are in range
  if(idx >= size_total)
    return;

  // Else get our values from memory
  uint32_t u = u_vals[idx];
  uint32_t v = v_vals[idx];
  float plane_a = planes_a[idx];
  float plane_b = planes_b[idx];
  float plane_c = planes_c[idx];
  bool valid = valids[idx];

  // address of disparity we want to compute
  // (subsampling is hard-coded to false above, so the full-resolution branch is taken)
  uint32_t d_addr;
  if (subsampling) d_addr = getAddressOffsetImage_GPU(u/2,v/2,width/2);
  else             d_addr = getAddressOffsetImage_GPU(u,v,width);
  
  // check if u is ok
  if (u<window_size || u>=width-window_size)
    return;

  // compute line start address (16 descriptor bytes per pixel; the row index
  // is clamped to [2, height-3])
  int32_t  line_offset = 16*width*max(min(v,height-3),2);
  uint8_t *I1_line_addr,*I2_line_addr;
  if (!right_image) {
    I1_line_addr = I1_desc+line_offset;
    I2_line_addr = I2_desc+line_offset;
  } else {
    // right image is the reference: swap the roles of the two descriptor sets
    I1_line_addr = I2_desc+line_offset;
    I2_line_addr = I1_desc+line_offset;
  }

  // compute I1 block start address
  uint8_t* I1_block_addr = I1_line_addr+16*u;
  
  // does this patch have enough texture?
  // (sum of |byte-128| over the descriptor; match_texture is a bool that is
  //  promoted to 1 in the comparison, so only sum == 0 is rejected)
  int32_t sum = 0;
  for (int32_t i=0; i<16; i++)
    sum += abs((int32_t)(*(I1_block_addr+i))-128);
  if (sum<match_texture)
    return;

  // compute disparity, min disparity and max disparity of plane prior
  // (search range is clamped to [0, disp_num-1])
  int32_t d_plane     = (int32_t)(plane_a*(float)u+plane_b*(float)v+plane_c);
  int32_t d_plane_min = max(d_plane-plane_radius,0);
  int32_t d_plane_max = min(d_plane+plane_radius,disp_num-1);

  // get grid pointer: the cell's first int is the number of candidates,
  // the candidates themselves follow directly after it
  int32_t  grid_x    = (int32_t)floor((float)u/(float)grid_size);
  int32_t  grid_y    = (int32_t)floor((float)v/(float)grid_size);
  uint32_t grid_addr = getAddressOffsetGrid_GPU(grid_x,grid_y,0,grid_dims[1],grid_dims[0]);  
  int32_t  num_grid  = *(disparity_grid+grid_addr);
  int32_t* d_grid    = disparity_grid+grid_addr+1;
  
  // loop variables
  int32_t d_curr, u_warp, val;
  int32_t min_val = 10000;   // best cost so far; a candidate must cost less to be accepted
  int32_t min_d   = -1;      // best disparity so far; -1 = none found

  // candidate set 1: disparities from the grid cell
    for (int32_t i=0; i<num_grid; i++) {
      d_curr = d_grid[i];
      if (d_curr<d_plane_min || d_curr>d_plane_max) { //If the current disparity is out of the planes range
        // warped column: u-d for the left image, u+d for the right image
        // NOTE(review): u is unsigned, so this is evaluated in unsigned
        // arithmetic and converted back to int32_t on assignment
        u_warp = u-d_curr+2*right_image*d_curr;
        if (u_warp<window_size || u_warp>=width-window_size)
          continue;
        // from here on u_warp is a byte offset into the descriptor line
        u_warp = 16*u_warp;
        val = 0;
        for(int j=0; j<16; j++){
            // __sad(a,b,c) yields |a-b|+c, i.e. val accumulates the SAD of the two descriptors
            val = __sad((int)(*(I1_block_addr+j)),(int)(*(I2_line_addr+j+u_warp)),val);
        }
        
        if (val<min_val) {
            min_val = val;
            min_d   = d_curr;
        }
      }
    }
    // candidate set 2: every disparity inside the plane prior range
    for (d_curr=d_plane_min; d_curr<=d_plane_max; d_curr++) {
            u_warp = u-d_curr+2*right_image*d_curr;
      if (u_warp<window_size || u_warp>=width-window_size)
        continue;
      // from here on u_warp is a byte offset into the descriptor line
      u_warp = 16*u_warp;
      val = 0;
      for(int j=0; j<16; j++){
          // __sad(a,b,c) yields |a-b|+c, i.e. val accumulates the SAD of the two descriptors
          val = __sad((int)(*(I1_block_addr+j)),(int)(*(I2_line_addr+j+u_warp)),val);
      }
      // add the table cost for the distance to the plane prior (only if valid)
      val += valid?*(P+abs(d_curr-d_plane)):0;
      if (val<min_val) {
        min_val = val;
        min_d   = d_curr;
      }
    }

  // set disparity value
  if (min_d>=0) *(D+d_addr) = min_d; // MAP value (min neg-Log probability)
  else          *(D+d_addr) = -1;    // invalid disparity
}
133 | 
134 | // implements approximation to 8x8 bilateral filtering
/**
 * Maps an invalid (negative) disparity to -10.
 * With the weight function max(0, 4-|a-b|) a tap of -10 gets weight 0
 * against every valid (>= 0) disparity, so invalid pixels never leak
 * into the mean of a valid one.
 */
static __device__ __forceinline__ float adaptiveMeanSanitize (float d) {
  return (d < 0.0f) ? -10.0f : d;
}

/**
 * 8-tap approximate bilateral filter around *center.
 * Taps are center[-4*stride] ... center[+3*stride]; each tap is weighted
 * with max(0, 4-|tap-center|).
 * Returns the weighted mean, or -1 when the pixel must be left untouched
 * (all weights zero, or the mean is negative).
 */
static __device__ __forceinline__ float adaptiveMeanFilter8 (const float* center, int32_t stride) {
  const float val_curr = *center;
  float weight_sum = 0.0f;
  float factor_sum = 0.0f;
  for (int32_t k=-4; k<4; k++) {
    const float val    = center[k*stride];
    const float weight = fmaxf(0.0f, 4.0f - fabsf(val-val_curr));
    weight_sum += weight;
    factor_sum += val*weight;
  }
  if (weight_sum > 0.0f) {
    const float d = factor_sum/weight_sum;
    if (d >= 0.0f)
      return d;
  }
  return -1.0f;
}

/**
 * Full resolution adaptive mean (approximate bilateral filter), in place on D.
 *
 * Every thread owns one pixel (u0,v0) with 4 <= u0 <= D_width-4 and
 * 4 <= v0 <= D_height-4. A horizontal 8-tap pass is followed by a vertical
 * 8-tap pass; each pass writes its result back to D.
 *
 *   D        device pointer to the D_width x D_height disparity image
 *   D_width  image width in pixels
 *   D_height image height in pixels
 *
 * Launch requirements: 2D blocks with blockDim.x <= 32 and blockDim.y <= 32
 * (size of the shared tile); thread (0,0) of block (0,0) maps to pixel (4,4).
 *
 * NOTE(review): the filter works in place on global memory, so halo pixels
 * owned by neighbouring blocks may be read before or after those blocks have
 * written them. Removing that race needs a separate output buffer (or one
 * kernel launch per pass), which would change the kernel interface.
 */
__global__ void adaptiveMeanGPU8 (float* D, int32_t D_width, int32_t D_height) {

  // Shared tile: up to 32 pixels per block edge plus 4 halo pixels before
  // and 3 after, because the 8 taps reach from -4 to +3.
  enum { kMaxBlock = 32, kTile = kMaxBlock+7 };
  __shared__ float tile[kTile][kTile];

  // A larger block would overflow the tile. The test is uniform over the
  // block, so leaving here cannot split a barrier.
  if (blockDim.x > kMaxBlock || blockDim.y > kMaxBlock)
    return;

  // Global pixel coordinates (signed, so tiny images cannot wrap around)
  const int32_t u0 = (int32_t)(blockDim.x*blockIdx.x + threadIdx.x) + 4;
  const int32_t v0 = (int32_t)(blockDim.y*blockIdx.y + threadIdx.y) + 4;
  // Coordinates inside the shared tile
  const int32_t ut = (int32_t)threadIdx.x + 4;
  const int32_t vt = (int32_t)threadIdx.y + 4;

  // Threads outside the filter range stay alive so that every thread of the
  // block reaches each __syncthreads(); they just do no memory work.
  const bool active = (u0 <= D_width-4) && (v0 <= D_height-4);
  // Only dereferenced by active threads
  const int32_t idx = v0*D_width + u0;

  // The thread on the border of the block, or on the border of the filtered
  // region inside a partial block, is responsible for loading the halo.
  const bool edge_left   = (threadIdx.x == 0);
  const bool edge_right  = (threadIdx.x == blockDim.x-1) || (u0 == D_width-4);
  const bool edge_top    = (threadIdx.y == 0);
  const bool edge_bottom = (threadIdx.y == blockDim.y-1) || (v0 == D_height-4);

  // ---- horizontal pass -------------------------------------------------
  if (active) {
    tile[vt][ut] = adaptiveMeanSanitize(D[idx]);
    if (edge_left)
      for (int32_t k=1; k<=4; k++)
        tile[vt][ut-k] = adaptiveMeanSanitize(D[idx-k]);
    if (edge_right)
      for (int32_t k=1; k<=3; k++)
        tile[vt][ut+k] = adaptiveMeanSanitize(D[idx+k]);
  }
  __syncthreads();

  if (active) {
    const float d = adaptiveMeanFilter8(&tile[vt][ut], 1);
    if (d >= 0.0f)
      D[idx] = d;
  }
  // All taps of the horizontal pass must be consumed before the tile is reused
  __syncthreads();

  // ---- vertical pass ---------------------------------------------------
  if (active) {
    tile[vt][ut] = adaptiveMeanSanitize(D[idx]);
    if (edge_top)
      for (int32_t k=1; k<=4; k++)
        tile[vt-k][ut] = adaptiveMeanSanitize(D[idx-k*D_width]);
    if (edge_bottom)
      for (int32_t k=1; k<=3; k++)
        tile[vt+k][ut] = adaptiveMeanSanitize(D[idx+k*D_width]);
  }
  __syncthreads();

  if (active) {
    const float d = adaptiveMeanFilter8(&tile[vt][ut], kTile);
    if (d >= 0.0f)
      D[idx] = d;
  }
}
271 | 
/**
 * Appends to (pixs_u, pixs_v) the pixels lying between two triangle edges.
 * For every column u in [u_begin, u_end) both edge lines v = a*u+b are
 * evaluated and all rows between the two results (upper bound exclusive)
 * are emitted, clamped to [0, v_limit). When subsampling is set only even
 * columns and even rows are emitted.
 */
static void elasGpuAppendSpans (int32_t u_begin, int32_t u_end,
                                float edge1_a, float edge1_b,
                                float edge2_a, float edge2_b,
                                bool subsampling, int32_t v_limit,
                                std::vector<int32_t> &pixs_u, std::vector<int32_t> &pixs_v) {
  for (int32_t u=u_begin; u<u_end; u++) {
    // If we are sub-sampling skip every second column
    if (subsampling && u%2!=0)
      continue;
    // Signed casts: a slightly negative line value must not go through an
    // unsigned conversion (undefined for negative floats)
    const int32_t v_1 = (int32_t)(edge1_a*(float)u+edge1_b);
    const int32_t v_2 = (int32_t)(edge2_a*(float)u+edge2_b);
    // Keep the span inside the image so the kernel is never handed an
    // out-of-range row
    const int32_t v_begin = std::max<int32_t>(std::min<int32_t>(v_1,v_2), 0);
    const int32_t v_end   = std::min<int32_t>(std::max<int32_t>(v_1,v_2), v_limit);
    for (int32_t v=v_begin; v<v_end; v++) {
      // If we are sub-sampling skip every second row
      if (subsampling && v%2!=0)
        continue;
      pixs_u.push_back(u);
      pixs_v.push_back(v);
    }
  }
}

/**
 * This is the core method that computes the disparity of the image.
 * The host rasterises every triangle into a flat list of (u,v) pixels, each
 * tagged with the plane parameters and validity flag of its triangle; the
 * findMatch_GPU kernel is then launched over that list and the resulting
 * disparity image is copied back into D.
 *
 *   p_support       support points; corner indices of the triangles refer to it
 *   tri             triangles with plane parameters for both images
 *   disparity_grid  host array of grid_dims[0]*grid_dims[1]*grid_dims[2] entries
 *   grid_dims       three grid dimensions; grid_dims[0]-1 is the number of disparities
 *   I1_desc,I2_desc host descriptor images, 16 bytes per pixel
 *   right_image     selects the right-image plane parameters and corner positions
 *   D               host disparity image, initialised to -10 and overwritten
 *
 * NOTE(review): when param.subsampling is set only (width/2)*(height/2)
 * entries of D are initialised, yet width*height floats are transferred to
 * and from the device - confirm that callers allocate D at full size.
 */
void ElasGPU::computeDisparity(std::vector<support_pt> p_support, std::vector<triangle> tri, int32_t* disparity_grid, int32_t *grid_dims,
                                uint8_t* I1_desc, uint8_t* I2_desc, bool right_image, float* D) {

  // number of disparities
  const int32_t disp_num  = grid_dims[0]-1;

  // init disparity image to -10
  const int32_t D_size = param.subsampling ? (width/2)*(height/2) : width*height;
  for (int32_t i=0; i<D_size; i++)
    *(D+i) = -10;

  // pre-compute prior
  float two_sigma_squared = 2*param.sigma*param.sigma;
  std::vector<int32_t> P(disp_num>0 ? disp_num : 0);
  for (int32_t delta_d=0; delta_d<disp_num; delta_d++)
    P[delta_d] = (int32_t)((-log(param.gamma+exp(-delta_d*delta_d/two_sigma_squared))+log(param.gamma))/param.beta);
  int32_t plane_radius = (int32_t)max((float)ceil(param.sigma*param.sradius),(float)2.0);

  // Per-pixel work list handed to the kernel. The containers grow on
  // demand, so overlapping triangles can never overrun a fixed-size buffer.
  std::vector<float>   planes_a, planes_b, planes_c;
  std::vector<int32_t> pixs_u, pixs_v;
  std::vector<char>    valid_flags;

  // for all triangles do
  for (size_t i=0; i<tri.size(); i++) {

    // get plane parameters (plane_d is the slope of the plane projected
    // into the other image and is only used for the validity test)
    float plane_a,plane_b,plane_c,plane_d;
    if (!right_image) {
      plane_a = tri[i].t1a;
      plane_b = tri[i].t1b;
      plane_c = tri[i].t1c;
      plane_d = tri[i].t2a;
    } else {
      plane_a = tri[i].t2a;
      plane_b = tri[i].t2b;
      plane_c = tri[i].t2c;
      plane_d = tri[i].t1a;
    }

    // triangle corners
    const int32_t c1 = tri[i].c1;
    const int32_t c2 = tri[i].c2;
    const int32_t c3 = tri[i].c3;

    // corner positions; for the right image u is shifted by the disparity
    float tri_u[3];
    if (!right_image) {
      tri_u[0] = p_support[c1].u;
      tri_u[1] = p_support[c2].u;
      tri_u[2] = p_support[c3].u;
    } else {
      tri_u[0] = p_support[c1].u-p_support[c1].d;
      tri_u[1] = p_support[c2].u-p_support[c2].d;
      tri_u[2] = p_support[c3].u-p_support[c3].d;
    }
    float tri_v[3] = {p_support[c1].v,p_support[c2].v,p_support[c3].v};

    // sort triangle corners wrt. u (ascending)
    for (uint32_t j=0; j<3; j++) {
      for (uint32_t k=0; k<j; k++) {
        if (tri_u[k]>tri_u[j]) {
          float tri_u_temp = tri_u[j]; tri_u[j] = tri_u[k]; tri_u[k] = tri_u_temp;
          float tri_v_temp = tri_v[j]; tri_v[j] = tri_v[k]; tri_v[k] = tri_v_temp;
        }
      }
    }

    // rename corners
    float A_u = tri_u[0]; float A_v = tri_v[0];
    float B_u = tri_u[1]; float B_v = tri_v[1];
    float C_u = tri_u[2]; float C_v = tri_v[2];

    // compute straight lines connecting triangle corners
    // (slope stays 0 when both corners fall into the same pixel column)
    float AB_a = 0; float AC_a = 0; float BC_a = 0;
    if ((int32_t)(A_u)!=(int32_t)(B_u)) AB_a = (A_v-B_v)/(A_u-B_u);
    if ((int32_t)(A_u)!=(int32_t)(C_u)) AC_a = (A_v-C_v)/(A_u-C_u);
    if ((int32_t)(B_u)!=(int32_t)(C_u)) BC_a = (B_v-C_v)/(B_u-C_u);
    float AB_b = A_v-AB_a*A_u;
    float AC_b = A_v-AC_a*A_u;
    float BC_b = B_v-BC_a*B_u;

    // a plane is only valid if itself and its projection
    // into the other image is not too much slanted
    const bool valid = fabs(plane_a)<0.7 && fabs(plane_d)<0.7;

    const size_t first_pixel = pixs_u.size();

    // first part (triangle corner A->B), bounded by edges AC and AB
    if ((int32_t)(A_u)!=(int32_t)(B_u))
      elasGpuAppendSpans(std::max<int32_t>((int32_t)A_u,0), std::min<int32_t>((int32_t)B_u,width),
                         AC_a, AC_b, AB_a, AB_b, param.subsampling, height, pixs_u, pixs_v);

    // second part (triangle corner B->C), bounded by edges AC and BC
    if ((int32_t)(B_u)!=(int32_t)(C_u))
      elasGpuAppendSpans(std::max<int32_t>((int32_t)B_u,0), std::min<int32_t>((int32_t)C_u,width),
                         AC_a, AC_b, BC_a, BC_b, param.subsampling, height, pixs_u, pixs_v);

    // Tag every pixel of this triangle with its plane and validity
    const size_t added = pixs_u.size()-first_pixel;
    planes_a.insert(planes_a.end(), added, plane_a);
    planes_b.insert(planes_b.end(), added, plane_b);
    planes_c.insert(planes_c.end(), added, plane_c);
    valid_flags.insert(valid_flags.end(), added, valid ? 1 : 0);
  }

  const int32_t size_total = (int32_t)pixs_u.size();

  // Debug
  cout << "Original Size: " << width*height << endl;
  cout << "Total Size: " << size_total << endl;

  // Nothing to match: D keeps its -10 initialisation. This also avoids a
  // kernel launch with zero blocks, which is an invalid configuration.
  if (size_total == 0)
    return;

  // The kernel takes a bool array; std::vector<bool> is bit-packed and
  // cannot be copied byte-wise, so unpack the flags first.
  bool* valids = new bool[size_total];
  for (int32_t j=0; j<size_total; j++)
    valids[j] = (valid_flags[j] != 0);

  // One thread per work-list entry, rounded up to whole blocks
  const int block_size = 32;
  const int grid_size  = (size_total+block_size-1)/block_size;
  dim3 DimGrid(grid_size,1,1);
  dim3 DimBlock(block_size,1,1);

  // Transfer sizes in bytes
  const size_t pix_bytes   = (size_t)size_total*sizeof(int32_t);
  const size_t plane_bytes = (size_t)size_total*sizeof(float);
  const size_t valid_bytes = (size_t)size_total*sizeof(bool);
  const size_t grid_bytes  = (size_t)grid_dims[0]*grid_dims[1]*grid_dims[2]*sizeof(int32_t);
  const size_t prior_bytes = P.size()*sizeof(int32_t);
  const size_t image_bytes = (size_t)width*height*sizeof(float);
  const size_t desc_bytes  = (size_t)16*width*height*sizeof(uint8_t);

  // Work list: pixel coordinates, plane parameters, validity
  int32_t *d_u_vals = NULL, *d_v_vals = NULL;
  float   *d_planes_a = NULL, *d_planes_b = NULL, *d_planes_c = NULL;
  bool    *d_valids = NULL;
  cudaMalloc((void**) &d_u_vals, pix_bytes);
  cudaMalloc((void**) &d_v_vals, pix_bytes);
  cudaMalloc((void**) &d_planes_a, plane_bytes);
  cudaMalloc((void**) &d_planes_b, plane_bytes);
  cudaMalloc((void**) &d_planes_c, plane_bytes);
  cudaMalloc((void**) &d_valids, valid_bytes);
  cudaMemcpy(d_u_vals, &pixs_u[0], pix_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_v_vals, &pixs_v[0], pix_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_planes_a, &planes_a[0], plane_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_planes_b, &planes_b[0], plane_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_planes_c, &planes_c[0], plane_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_valids, valids, valid_bytes, cudaMemcpyHostToDevice);

  // The host copy of the flags is no longer needed
  delete[] valids;

  // Image-sized inputs: disparity grid, prior, disparity image, descriptors
  int32_t *d_disparity_grid = NULL, *d_grid_dims = NULL, *d_P = NULL;
  float   *d_D = NULL;
  uint8_t *d_I1 = NULL, *d_I2 = NULL;
  cudaMalloc((void**) &d_disparity_grid, grid_bytes);
  cudaMalloc((void**) &d_grid_dims, 3*sizeof(int32_t));
  cudaMalloc((void**) &d_P, prior_bytes);
  cudaMalloc((void**) &d_D, image_bytes);
  cudaMalloc((void**) &d_I1, desc_bytes); //Device descriptors
  cudaMalloc((void**) &d_I2, desc_bytes); //Device descriptors
  cudaMemcpy(d_disparity_grid, disparity_grid, grid_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_grid_dims, grid_dims, 3*sizeof(int32_t), cudaMemcpyHostToDevice);
  if (!P.empty())
    cudaMemcpy(d_P, &P[0], prior_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_D, D, image_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_I1, I1_desc, desc_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_I2, I2_desc, desc_bytes, cudaMemcpyHostToDevice);

  // Launch the kernel
  findMatch_GPU<<<DimGrid, DimBlock>>>(d_u_vals, d_v_vals, size_total, d_planes_a, d_planes_b, d_planes_c,
                                        d_disparity_grid, d_grid_dims, d_I1, d_I2, d_P, plane_radius,
                                        width, height, d_valids, right_image, d_D);

  // Wait for the kernel and report launch/allocation/execution errors
  // instead of dropping them silently
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    status = cudaDeviceSynchronize();
  if (status != cudaSuccess)
    std::cerr << "ElasGPU::computeDisparity: CUDA error: " << cudaGetErrorString(status) << std::endl;

  // Copy the final disparity values back over
  cudaMemcpy(D, d_D, image_bytes, cudaMemcpyDeviceToHost);

  // Free device memory (every buffer exactly once)
  cudaFree(d_u_vals);
  cudaFree(d_v_vals);
  cudaFree(d_planes_a);
  cudaFree(d_planes_b);
  cudaFree(d_planes_c);
  cudaFree(d_valids);
  cudaFree(d_disparity_grid);
  cudaFree(d_grid_dims);
  cudaFree(d_P);
  cudaFree(d_D);
  cudaFree(d_I1);
  cudaFree(d_I2);

}
546 | 
/**
 * 4-tap approximate bilateral filter used by the subsampled path.
 * val holds the four taps, val_curr is the pixel being filtered and
 * xabsmask is the bit mask that clears the sign bit of each lane.
 * Every tap is weighted with max(0, 4-|tap-val_curr|).
 * Returns the weighted mean, or -1 when the pixel must be left untouched
 * (all weights zero, or the mean is negative).
 */
static float elasGpuBilateral4 (const float* val, float val_curr, __m128 xabsmask) {
  const __m128 xval = _mm_loadu_ps(val);
  __m128 xweight = _mm_sub_ps(xval,_mm_set1_ps(val_curr));
  xweight = _mm_and_ps(xweight,xabsmask);
  xweight = _mm_sub_ps(_mm_set1_ps(4.0f),xweight);
  xweight = _mm_max_ps(_mm_set1_ps(0),xweight);
  const __m128 xfactor = _mm_mul_ps(xval,xweight);

  float weight[4];
  float factor[4];
  _mm_storeu_ps(weight,xweight);
  _mm_storeu_ps(factor,xfactor);

  const float weight_sum = weight[0]+weight[1]+weight[2]+weight[3];
  const float factor_sum = factor[0]+factor[1]+factor[2]+factor[3];

  if (weight_sum>0) {
    const float d = factor_sum/weight_sum;
    if (d>=0)
      return d;
  }
  return -1.0f;
}

// implements approximation to bilateral filtering
//   D(x) = sum(I(xi)*f(I(xi)-I(x))*g(xi-x))/W(x)
//   W(x) = sum(f(I(xi)-I(x))*g(xi-x))
//   g(xi-x) = 1
//   f(I(xi)-I(x)) = 4-|I(xi)-I(x)| if greater than 0, 0 otherwise
// D is filtered in place. Full resolution uses the 8 pixel wide
// adaptiveMeanGPU8 kernel; with subsampling a 4 pixel wide filter runs on
// the host.
void ElasGPU::adaptiveMean (float* D) {

  // get disparity image dimensions
  int32_t D_width          = width;
  int32_t D_height         = height;
  if (param.subsampling) {
    D_width          = width/2;
    D_height         = height/2;
  }

  // full resolution: 8 pixel bilateral filter width, computed on the GPU
  if (!param.subsampling) {

    // The kernel filters columns 4..D_width-4 and rows 4..D_height-4
    const int32_t cols = D_width-7;
    const int32_t rows = D_height-7;

    // Image too small to contain a single filtered pixel; a launch with
    // zero blocks would be an invalid configuration
    if (cols<=0 || rows<=0)
      return;

    // One thread per filtered pixel, rounded up to whole blocks
    const int block_width  = 8;
    const int block_height = block_width;
    dim3 DimGrid((cols+block_width-1)/block_width,(rows+block_height-1)/block_height,1);
    dim3 DimBlock(block_width,block_height,1);

    // Allocate on global memory and copy
    const size_t image_bytes = (size_t)D_width*D_height*sizeof(float);
    float* d_D = NULL;
    if (cudaMalloc((void**) &d_D, image_bytes) != cudaSuccess) {
      std::cerr << "ElasGPU::adaptiveMean: device allocation failed" << std::endl;
      return;
    }
    cudaMemcpy(d_D, D, image_bytes, cudaMemcpyHostToDevice);

    //Kernel go!
    adaptiveMeanGPU8<<<DimGrid, DimBlock>>>(d_D, D_width, D_height);

    // Wait for the kernel and report errors instead of dropping them
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
      status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
      std::cerr << "ElasGPU::adaptiveMean: CUDA error: " << cudaGetErrorString(status) << std::endl;

    // Copy the final disparity values back over
    cudaMemcpy(D, d_D, image_bytes, cudaMemcpyDeviceToHost);

    //Free memory
    cudaFree(d_D);
    return;
  }

  // when doing subsampling: 4 pixel bilateral filter width, on the host

  // allocate temporary memory
  const size_t D_bytes = (size_t)D_width*D_height*sizeof(float);
  float* D_copy = (float*)malloc(D_bytes);
  float* D_tmp  = (float*)malloc(D_bytes);
  if (!D_copy || !D_tmp) {
    std::cerr << "ElasGPU::adaptiveMean: host allocation failed" << std::endl;
    free(D_copy);
    free(D_tmp);
    return;
  }
  memcpy(D_copy,D,D_bytes);

  // zero input disparity maps to -10 (this makes the bilateral
  // weights of all valid disparities to 0 in this region)
  for (int32_t i=0; i<D_width*D_height; i++) {
    if (*(D_copy+i)<0)
      *(D_copy+i) = -10;
  }

  // The horizontal pass only writes rows 3..D_height-4, but the vertical
  // pass reads every row of D_tmp; start from the sanitised input so no
  // uninitialised memory is ever filtered.
  memcpy(D_tmp,D_copy,D_bytes);

  // set bitwise absolute value mask: all bits except the sign bit.
  // The bit pattern is copied into a float; _mm_set1_ps(0x7FFFFFFF) would
  // convert the integer to the value 2147483648.0f instead.
  const uint32_t abs_bits = 0x7FFFFFFF;
  float abs_pattern;
  memcpy(&abs_pattern,&abs_bits,sizeof(abs_pattern));
  const __m128 xabsmask = _mm_load1_ps(&abs_pattern);

  // ring buffer with the four most recent pixels of the current row/column
  float val[4];

  // horizontal filter
  for (int32_t v=3; v<D_height-3; v++) {

    // init
    for (int32_t u=0; u<3; u++)
      val[u] = *(D_copy+v*D_width+u);

    // loop
    for (int32_t u=3; u<D_width; u++) {

      // set: the pixel being filtered is one behind the newest tap
      float val_curr = *(D_copy+v*D_width+(u-1));
      val[u%4] = *(D_copy+v*D_width+u);

      const float d = elasGpuBilateral4(val,val_curr,xabsmask);
      if (d>=0) *(D_tmp+v*D_width+(u-1)) = d;
    }
  }

  // vertical filter
  for (int32_t u=3; u<D_width-3; u++) {

    // init
    for (int32_t v=0; v<3; v++)
      val[v] = *(D_tmp+v*D_width+u);

    // loop
    for (int32_t v=3; v<D_height; v++) {

      // set: the pixel being filtered is one behind the newest tap
      float val_curr = *(D_tmp+(v-1)*D_width+u);
      val[v%4] = *(D_tmp+v*D_width+u);

      const float d = elasGpuBilateral4(val,val_curr,xabsmask);
      if (d>=0) *(D+(v-1)*D_width+u) = d;
    }
  }

  // free memory
  free(D_copy);
  free(D_tmp);
}


--------------------------------------------------------------------------------
/GPU/elas_gpu.h:
--------------------------------------------------------------------------------
 1 | #ifndef __ELAS_GPU_H__
 2 | #define __ELAS_GPU_H__
 3 | 
 4 | // Enable profiling
 5 | #define PROFILE
 6 | 
 7 | #include <algorithm>
 8 | #include <math.h>
 9 | #include <vector>
10 | #include <cuda.h>
11 | #include <stdint.h>
12 | #include <functional>  
13 | 
14 | #include "elas.h"
15 | #include "descriptor.h"
16 | #include "triangle.h"
17 | #include "matrix.h"
18 | 
19 | 
20 | /**
21 |  * Our ElasGPU class with all cuda implementations
22 |  * Note where we extend the Elas class so we are calling
23 |  * On all non-gpu functions there if they are not implemented
24 |  */
25 | class ElasGPU : public Elas {
26 | 
27 | public:
28 | 
29 |   // Constructor, input: parameters
30 |   // Pass this to the super constructor
31 |   ElasGPU(parameters param) : Elas(param) {}
32 | 
33 | // This was originally "private"
34 | // Was converted to allow sub-classes to call this
35 | // This assumes the user knows what they are doing
36 | public:
37 | 
38 |   void computeDisparity(std::vector<support_pt> p_support,std::vector<triangle> tri,int32_t* disparity_grid,int32_t *grid_dims,
39 |                         uint8_t* I1_desc,uint8_t* I2_desc,bool right_image,float* D);
40 | 
41 |   void adaptiveMean (float* D);
42 | 
43 | };
44 | 
45 | 
46 | #endif //__ELAS_GPU_H__
47 | 


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/aloe_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/aloe_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/aloe_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/aloe_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/cones_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/cones_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/cones_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/cones_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/raindeer_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/raindeer_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/raindeer_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/raindeer_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban1_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban1_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban1_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban1_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban2_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban2_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban2_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban2_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban3_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban3_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban3_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban3_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban4_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban4_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_cpu/urban4_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_cpu/urban4_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/aloe_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/aloe_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/aloe_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/aloe_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/cones_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/cones_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/cones_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/cones_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/raindeer_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/raindeer_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/raindeer_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/raindeer_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban1_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban1_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban1_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban1_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban2_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban2_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban2_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban2_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban3_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban3_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban3_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban3_right_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban4_left_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban4_left_disp.pgm


--------------------------------------------------------------------------------
/GPU_test/2016_12_06_gpu/urban4_right_disp.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/GPU_test/2016_12_06_gpu/urban4_right_disp.pgm


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Nicholas Geneva
 4 | Copyright (c) 2016 Patrick Geneva
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.


--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
 1 | # libelas-gpu
 2 | 
 3 | This is an implementation of the well known [libelas](http://www.cvlibs.net/software/libelas/)
 4 | (LIBrary for Efficient LArge-scale Stereo matching) library for
 5 | sparse large real-time calculation of stereo disparity images.
 6 | This is for a college course final project and thus does not have
 7 | any support of any kind. The original source code can be found
 8 | in the CPU directory. This has been commented as we explore the code.
 9 | 
10 | 
11 | From there, the GPU cuda implementation of key methods can be found
12 | in the GPU folder. For methods not implemented on the GPU the CPU
13 | version is called, and such code is run on the CPU.
14 | 
15 | ## Differences
16 | 
17 | The key difference is the changing of most of the methods in the ELAS class
18 | to being virtual. This allows for the sub-classes to override such methods.
19 | This allows for seamless transition between the GPU and CPU code. Additionally,
20 | the methods were made public to allow direct testing and comparison of the two implementations.
21 | 
22 | ## Building
23 | 
24 | * First create a build directory `mkdir build`
25 | * Move into this directory `cd build`
26 | * Run the c++ [cmake](https://cmake.org/) build system `cmake ..`
27 | * Finally build all the executables `make -j5`
28 | * To build a single executable, run `make -j5 <name-of-exc>`
29 | * To run the program do the following
30 |     - `./libelas_cpu demo`
31 |     - `./libelas_gpu`
32 | 
33 | 
34 | ## Licenses
35 | 
36 | All the original code is licensed under the **GNU General Public License**. This can be found with the original  [libelas](http://www.cvlibs.net/software/libelas/) library. All other code is licensed under the MIT license, which is attached to this repo's LICENSE file.
37 | 


--------------------------------------------------------------------------------
/input/aloe_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/aloe_left.pgm


--------------------------------------------------------------------------------
/input/aloe_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/aloe_right.pgm


--------------------------------------------------------------------------------
/input/cones_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/cones_left.pgm


--------------------------------------------------------------------------------
/input/cones_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/cones_right.pgm


--------------------------------------------------------------------------------
/input/raindeer_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/raindeer_left.pgm


--------------------------------------------------------------------------------
/input/raindeer_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/raindeer_right.pgm


--------------------------------------------------------------------------------
/input/urban1_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban1_left.pgm


--------------------------------------------------------------------------------
/input/urban1_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban1_right.pgm


--------------------------------------------------------------------------------
/input/urban2_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban2_left.pgm


--------------------------------------------------------------------------------
/input/urban2_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban2_right.pgm


--------------------------------------------------------------------------------
/input/urban3_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban3_left.pgm


--------------------------------------------------------------------------------
/input/urban3_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban3_right.pgm


--------------------------------------------------------------------------------
/input/urban4_left.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban4_left.pgm


--------------------------------------------------------------------------------
/input/urban4_right.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/input/urban4_right.pgm


--------------------------------------------------------------------------------
/main_cpu.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright 2011. All rights reserved.
  3 | Institute of Measurement and Control Systems
  4 | Karlsruhe Institute of Technology, Germany
  5 | 
  6 | This file is part of libelas.
  7 | Authors: Andreas Geiger
  8 | 
  9 | libelas is free software; you can redistribute it and/or modify it under the
 10 | terms of the GNU General Public License as published by the Free Software
 11 | Foundation; either version 3 of the License, or any later version.
 12 | 
 13 | libelas is distributed in the hope that it will be useful, but WITHOUT ANY
 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License along with
 18 | libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
 19 | Street, Fifth Floor, Boston, MA 02110-1301, USA 
 20 | */
 21 | 
 22 | // Demo program showing how libelas can be used, try "./elas -h" for help
 23 | #include <iostream>
 24 | #include "elas.h"
 25 | #include "image.h"
 26 | 
 27 | using namespace std;
 28 | 
 29 | 
 30 | // compute disparities of pgm image input pair file_1, file_2 and save each as
 31 | // <input name without its 4-character extension>_disp.pgm, scaled to [0..255]
 32 | void process (const char* file_1,const char* file_2) {
 33 |   cout << "Processing: " << file_1 << ", " << file_2 << endl;
 34 | 
 35 |   // reject names too short to strip or too long for the output name buffers
 36 |   const char suffix[] = "_disp.pgm";
 37 |   char output_1[1024],output_2[1024];
 38 |   const size_t len_1 = strlen(file_1), len_2 = strlen(file_2);
 39 |   if (len_1<4 || len_1-4+sizeof(suffix)>sizeof(output_1) ||
 40 |       len_2<4 || len_2-4+sizeof(suffix)>sizeof(output_2)) {
 41 |     cout << "ERROR: Unsupported input file name length" << endl;
 42 |     return;
 43 |   }
 44 | 
 45 |   image<uchar> *I1 = loadPGM(file_1);
 46 |   image<uchar> *I2 = loadPGM(file_2);
 47 | 
 48 |   // check for correct size
 49 |   if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 ||
 50 |       I1->width()!=I2->width() || I1->height()!=I2->height()) {
 51 |     cout << "ERROR: Images must be of same size, but" << endl;
 52 |     cout << "       I1: " << I1->width() <<  " x " << I1->height() << 
 53 |                  ", I2: " << I2->width() <<  " x " << I2->height() << endl;
 54 |     delete I1;
 55 |     delete I2;
 56 |     return;
 57 |   }
 58 | 
 59 |   // allocate memory for disparity images
 60 |   const int32_t width = I1->width(), height = I1->height();
 61 |   const int32_t dims[3] = {width,height,width}; // bytes per line = width
 62 |   float* D1_data = (float*)malloc(width*height*sizeof(float));
 63 |   float* D2_data = (float*)malloc(width*height*sizeof(float));
 64 | 
 65 |   // process
 66 |   Elas::parameters param;
 67 |   param.postprocess_only_left = false;
 68 |   Elas elas(param);
 69 |   elas.process(I1->data,I2->data,D1_data,D2_data,dims);
 70 | 
 71 |   // find maximum disparity for scaling output disparity images to [0..255]
 72 |   float disp_max = 0;
 73 |   for (int32_t i=0; i<width*height; i++) {
 74 |     if (D1_data[i]>disp_max) disp_max = D1_data[i];
 75 |     if (D2_data[i]>disp_max) disp_max = D2_data[i];
 76 |   }
 77 | 
 78 |   // copy float to uchar (all zeros if no positive disparity: avoids x/0)
 79 |   image<uchar> *D1 = new image<uchar>(width,height);
 80 |   image<uchar> *D2 = new image<uchar>(width,height);
 81 |   for (int32_t i=0; i<width*height; i++) {
 82 |     D1->data[i] = (uint8_t)(disp_max>0 ? max(255.0*D1_data[i]/disp_max,0.0) : 0.0);
 83 |     D2->data[i] = (uint8_t)(disp_max>0 ? max(255.0*D2_data[i]/disp_max,0.0) : 0.0);
 84 |   }
 85 | 
 86 |   // save disparity images (name lengths were validated above)
 87 |   strncpy(output_1,file_1,len_1-4);
 88 |   strncpy(output_2,file_2,len_2-4);
 89 |   output_1[len_1-4] = '\0';
 90 |   output_2[len_2-4] = '\0';
 91 |   strcat(output_1,suffix);
 92 |   strcat(output_2,suffix);
 93 |   savePGM(D1,output_1);
 94 |   savePGM(D2,output_2);
 95 | 
 96 |   // free memory
 97 |   delete I1; delete I2;
 98 |   delete D1; delete D2;
 99 |   free(D1_data); free(D2_data);
100 | }
101 | 
102 | int main (int argc, char** argv) {
103 |   // pick the mode: demo set (paths assume a build sub-folder), one explicit
104 |   // stereo pair, or - for anything else, including "-h" - the usage text
105 |   const bool run_demo   = argc==2 && strcmp(argv[1],"demo")==0;
106 |   const bool run_single = argc==3;
107 |   if (run_demo) {
108 |     const char* pairs[7][2] = {
109 |       {"../input/cones_left.pgm",   "../input/cones_right.pgm"},
110 |       {"../input/aloe_left.pgm",    "../input/aloe_right.pgm"},
111 |       {"../input/raindeer_left.pgm","../input/raindeer_right.pgm"},
112 |       {"../input/urban1_left.pgm",  "../input/urban1_right.pgm"},
113 |       {"../input/urban2_left.pgm",  "../input/urban2_right.pgm"},
114 |       {"../input/urban3_left.pgm",  "../input/urban3_right.pgm"},
115 |       {"../input/urban4_left.pgm",  "../input/urban4_right.pgm"}
116 |     };
117 |     for (int32_t k=0; k<7; k++)
118 |       process(pairs[k][0],pairs[k][1]);
119 |   } else if (run_single) {
120 |     process(argv[1],argv[2]);
121 |   } else {
122 |     cout << endl
123 |          << "ELAS demo program usage: " << endl
124 |          << "./elas demo ................ process all test images (image dir)" << endl
125 |          << "./elas left.pgm right.pgm .. process a single stereo pair" << endl
126 |          << "./elas -h .................. shows this help" << endl
127 |          << endl
128 |          << "Note: All images must be pgm greylevel images. All output" << endl
129 |          << "      disparities will be scaled such that disp_max = 255." << endl
130 |          << endl;
131 |     return 0;
132 |   }
133 |   // both processing modes finish with the same message
134 |   cout << "... done!" << endl;
135 |   return 0;
136 | }
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/main_gpu.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <iostream>
  3 | #include "elas.h"
  4 | #include "elas_gpu.h"
  5 | #include "image.h"
  6 | 
  7 | using namespace std;
  8 | 
  9 | 
 10 | // Global functions
 11 | void process (const char* file_1,const char* file_2);
 12 | 
 13 | // Enable profiling
 14 | #define PROFILE
 15 | 
 16 | int main(int argc, char** argv) {
 17 |   // Start up the GPU device with a no-op call before any frame is processed, see
 18 |   // https://devtalk.nvidia.com/default/topic/895513/cuda-programming-and-performance/cudamalloc-slow/post/4724457/#4724457
 19 |   cudaFree(0);
 20 |   // Example stereo pairs, processed in this order
 21 |   const char* frames[7][2] = {
 22 |     {"../input/cones_left.pgm",   "../input/cones_right.pgm"},
 23 |     {"../input/aloe_left.pgm",    "../input/aloe_right.pgm"},
 24 |     {"../input/raindeer_left.pgm","../input/raindeer_right.pgm"},
 25 |     {"../input/urban1_left.pgm",  "../input/urban1_right.pgm"},
 26 |     {"../input/urban2_left.pgm",  "../input/urban2_right.pgm"},
 27 |     {"../input/urban3_left.pgm",  "../input/urban3_right.pgm"},
 28 |     {"../input/urban4_left.pgm",  "../input/urban4_right.pgm"}
 29 |   };
 30 |   for (int k=0; k<7; k++)
 31 |     process(frames[k][0],frames[k][1]);
 32 |   cout << "... done!" << endl;
 33 |   return EXIT_SUCCESS;
 34 | }
 35 | 
 36 | 
 37 | 
 38 | /**
 39 |  * Compute disparities of pgm image input pair file_1, file_2 and save each as
 40 |  * <input name without its 4-character extension>_disp.pgm, scaled to [0..255]
 41 |  */
 42 | void process (const char* file_1,const char* file_2) {
 43 |   cout << "Processing: " << file_1 << ", " << file_2 << endl;
 44 | 
 45 |   // reject names too short to strip or too long for the output name buffers
 46 |   const char suffix[] = "_disp.pgm";
 47 |   char output_1[1024],output_2[1024];
 48 |   const size_t len_1 = strlen(file_1), len_2 = strlen(file_2);
 49 |   if (len_1<4 || len_1-4+sizeof(suffix)>sizeof(output_1) ||
 50 |       len_2<4 || len_2-4+sizeof(suffix)>sizeof(output_2)) {
 51 |     cout << "ERROR: Unsupported input file name length" << endl;
 52 |     return;
 53 |   }
 54 | 
 55 |   image<uchar> *I1 = loadPGM(file_1);
 56 |   image<uchar> *I2 = loadPGM(file_2);
 57 | 
 58 |   // check for correct size
 59 |   if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 ||
 60 |       I1->width()!=I2->width() || I1->height()!=I2->height()) {
 61 |     cout << "ERROR: Images must be of same size, but" << endl;
 62 |     cout << "       I1: " << I1->width() <<  " x " << I1->height() << 
 63 |                  ", I2: " << I2->width() <<  " x " << I2->height() << endl;
 64 |     delete I1;
 65 |     delete I2;
 66 |     return;
 67 |   }
 68 | 
 69 |   // allocate memory for disparity images
 70 |   const int32_t width = I1->width(), height = I1->height();
 71 |   const int32_t dims[3] = {width,height,width}; // bytes per line = width
 72 |   float* D1_data = (float*)malloc(width*height*sizeof(float));
 73 |   float* D2_data = (float*)malloc(width*height*sizeof(float));
 74 | 
 75 |   // process
 76 |   Elas::parameters param;
 77 |   param.postprocess_only_left = false;
 78 |   //param.subsampling = true;
 79 |   ElasGPU elas(param);
 80 |   elas.process(I1->data,I2->data,D1_data,D2_data,dims);
 81 | 
 82 |   // find maximum disparity for scaling output disparity images to [0..255]
 83 |   float disp_max = 0;
 84 |   for (int32_t i=0; i<width*height; i++) {
 85 |     if (D1_data[i]>disp_max) disp_max = D1_data[i];
 86 |     if (D2_data[i]>disp_max) disp_max = D2_data[i];
 87 |   }
 88 | 
 89 |   // copy float to uchar (all zeros if no positive disparity: avoids x/0)
 90 |   image<uchar> *D1 = new image<uchar>(width,height);
 91 |   image<uchar> *D2 = new image<uchar>(width,height);
 92 |   for (int32_t i=0; i<width*height; i++) {
 93 |     D1->data[i] = (uint8_t)(disp_max>0 ? max(255.0*D1_data[i]/disp_max,0.0) : 0.0);
 94 |     D2->data[i] = (uint8_t)(disp_max>0 ? max(255.0*D2_data[i]/disp_max,0.0) : 0.0);
 95 |   }
 96 | 
 97 |   // save disparity images (name lengths were validated above)
 98 |   strncpy(output_1,file_1,len_1-4);
 99 |   strncpy(output_2,file_2,len_2-4);
100 |   output_1[len_1-4] = '\0';
101 |   output_2[len_2-4] = '\0';
102 |   strcat(output_1,suffix);
103 |   strcat(output_2,suffix);
104 |   savePGM(D1,output_1);
105 |   savePGM(D2,output_2);
106 | 
107 |   // free memory
108 |   delete I1; delete I2;
109 |   delete D1; delete D2;
110 |   free(D1_data); free(D2_data);
111 | }


--------------------------------------------------------------------------------
/main_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | #include <iostream>
 3 | #include "image.h"
 4 | 
 5 | using namespace std;
 6 | 
 7 | 
 8 | // compute disparities of pgm image input pair file_1, file_2
 9 | void process (const char* file_1,const char* file_2) {
10 | 
11 |   cout << "Processing: " << file_1 << ", " << file_2 << endl;
12 | 
13 |   // load images
14 |   image<uchar> *I1,*I2;
15 |   I1 = loadPGM(file_1);
16 |   I2 = loadPGM(file_2);
17 | 
18 |   // check for correct size
19 |   if (I1->width()<=0 || I1->height() <=0 || I2->width()<=0 || I2->height() <=0 ||
20 |       I1->width()!=I2->width() || I1->height()!=I2->height()) {
21 |     cout << "ERROR: Images must be of same size, but" << endl;
22 |     cout << "       I1: " << I1->width() <<  " x " << I1->height() << 
23 |                  ", I2: " << I2->width() <<  " x " << I2->height() << endl;
24 |     delete I1;
25 |     delete I2;
26 |     return;
27 |   }
28 | 
29 |   // Get image width and height
30 |   int32_t width  = I1->width();
31 |   int32_t height = I1->height();
32 | 
33 |   // Allocate the calculation image matrix
34 |   //image<uchar> *I3(width, height, true);
35 | 
36 |   // Variable for the total error
37 |   double sse = 0;
38 | 
39 |   // Compute the mean squared error between the two images
40 |   // http://stackoverflow.com/a/17237076
41 |   for(int32_t i=0; i<width; i++) {
42 |     for(int32_t j=0; j<height; j++) {
43 |       // cout << index << " - " <<  << endl;
44 |       sse += pow(abs(imRef(I1, i, j) - imRef(I2, i, j)),2);
45 |     }
46 |   }
47 | 
48 |   // MSE = sum((frame1-frame2)^2 ) / no. of pixels
49 |   double mse = sse/ (double)(width*height);
50 | 
51 |   // Print it
52 |   cout << "Mean Squared Error (MSE) = " << mse << endl;
53 |   
54 |   // free memory
55 |   delete I1;
56 |   delete I2;
57 | }
58 | 
59 | int main (int argc, char** argv) {
60 |   // two arguments: compare exactly that pair of images
61 |   if (argc==3) {
62 |     process(argv[1],argv[2]);
63 |     cout << "... done!" << endl;
64 |     return 0;
65 |   }
66 | 
67 |   // anything other than the single argument "demo" is a usage error
68 |   if (argc!=2 || strcmp(argv[1],"demo")!=0) {
69 |     cerr << "Please specify the two images you want to compare" << endl;
70 |     cerr << "./main_test <path_cpu_img> <path_gpu_img>" << endl;
71 |     return 0;
72 |   }
73 | 
74 |   // demo: compare the stored CPU and GPU results (hard coded for dev'ing)
75 |   process("../GPU_test/2016_12_06_cpu/cones_left_disp.pgm", "../GPU_test/2016_12_06_gpu/cones_left_disp.pgm");
76 |   process("../GPU_test/2016_12_06_cpu/cones_right_disp.pgm", "../GPU_test/2016_12_06_gpu/cones_right_disp.pgm");
77 |   process("../GPU_test/2016_12_06_cpu/aloe_left_disp.pgm", "../GPU_test/2016_12_06_gpu/aloe_left_disp.pgm");
78 |   process("../GPU_test/2016_12_06_cpu/aloe_right_disp.pgm", "../GPU_test/2016_12_06_gpu/aloe_right_disp.pgm");
79 |   process("../GPU_test/2016_12_06_cpu/raindeer_left_disp.pgm", "../GPU_test/2016_12_06_gpu/raindeer_left_disp.pgm");
80 |   process("../GPU_test/2016_12_06_cpu/raindeer_right_disp.pgm", "../GPU_test/2016_12_06_gpu/raindeer_right_disp.pgm");
81 |   process("../GPU_test/2016_12_06_cpu/urban1_left_disp.pgm", "../GPU_test/2016_12_06_gpu/urban1_left_disp.pgm");
82 |   process("../GPU_test/2016_12_06_cpu/urban1_right_disp.pgm", "../GPU_test/2016_12_06_gpu/urban1_right_disp.pgm");
83 |   process("../GPU_test/2016_12_06_cpu/urban2_left_disp.pgm", "../GPU_test/2016_12_06_gpu/urban2_left_disp.pgm");
84 |   process("../GPU_test/2016_12_06_cpu/urban2_right_disp.pgm", "../GPU_test/2016_12_06_gpu/urban2_right_disp.pgm");
85 |   process("../GPU_test/2016_12_06_cpu/urban3_left_disp.pgm", "../GPU_test/2016_12_06_gpu/urban3_left_disp.pgm");
86 |   process("../GPU_test/2016_12_06_cpu/urban3_right_disp.pgm", "../GPU_test/2016_12_06_gpu/urban3_right_disp.pgm");
87 |   process("../GPU_test/2016_12_06_cpu/urban4_left_disp.pgm", "../GPU_test/2016_12_06_gpu/urban4_left_disp.pgm");
88 |   process("../GPU_test/2016_12_06_cpu/urban4_right_disp.pgm", "../GPU_test/2016_12_06_gpu/urban4_right_disp.pgm");
89 |   cout << "... done!" << endl;
90 |   return 0;
91 | }
92 | 
93 | 
94 | 


--------------------------------------------------------------------------------
/references/2010ACCV_Geiger.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/2010ACCV_Geiger.pdf


--------------------------------------------------------------------------------
/references/2016IROS_Maddern.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/2016IROS_Maddern.pdf


--------------------------------------------------------------------------------
/references/StereoNotes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/references/StereoNotes.pdf


--------------------------------------------------------------------------------
/reports/2016_ELEG655_project_presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_presentation.pdf


--------------------------------------------------------------------------------
/reports/2016_ELEG655_project_propsal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_propsal.pdf


--------------------------------------------------------------------------------
/reports/2016_ELEG655_project_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/2016_ELEG655_project_report.pdf


--------------------------------------------------------------------------------
/reports/project.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goldbattle/libelas-gpu/3777055bc2eac2976ce39170131fe2c89f5a617e/reports/project.pdf


--------------------------------------------------------------------------------