├── CMakeLists.txt
├── README.md
├── UNLICENSE
└── gbfilter.cc


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | 
 3 | project(gbfilter)
 4 | 
 5 | if(CMAKE_COMPILER_IS_GNUCXX)
 6 |   set(SSE41_FLAGS "-msse4.1 -march=core2")
 7 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fopenmp ${SSE41_FLAGS}")
 8 |   add_definitions("-DGBF_OMP_STATS")
 9 | elseif(MSVC)
10 |   # MSVC's OpenMP implementation is crappy as hell
11 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox")
12 |   add_definitions("/DGBF_OMP_STATS")
13 | else()
14 |   if(${CMAKE_CXX_COMPILER})
15 |     message(WARNING "Unsupported compiler")
16 |   endif()
17 | endif()
18 | 
19 | add_executable(${CMAKE_PROJECT_NAME} gbfilter.cc)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Optimized CPU Gaussian blur filter
 2 | ==================================
 3 | 
 4 | Features :
 5 | * Cache efficient data access
 6 | * Multithreading using OpenMP
 7 | * Vectorization using SSE 4.1 intrinsics
 8 | * Cross-platform (Linux, Windows, OS X)
 9 | * 24bpp uncompressed BMP reader/writer
10 | 
11 | Build :
12 | ```shell
13 | mkdir build; cd build/
14 | cmake ..
15 | cmake --build . --config Release
16 | ```
17 | 
18 | Usage :
19 | ```shell
20 | ./gbfilter input.bmp output.bmp blur_radius tile_width tile_height
21 | ```
22 | 


--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/gbfilter.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 | This is free and unencumbered software released into the public domain.
  3 | 
  4 | Anyone is free to copy, modify, publish, use, compile, sell, or
  5 | distribute this software, either in source code form or as a compiled
  6 | binary, for any purpose, commercial or non-commercial, and by any
  7 | means.
  8 | 
  9 | In jurisdictions that recognize copyright laws, the author or authors
 10 | of this software dedicate any and all copyright interest in the
 11 | software to the public domain. We make this dedication for the benefit
 12 | of the public at large and to the detriment of our heirs and
 13 | successors. We intend this dedication to be an overt act of
 14 | relinquishment in perpetuity of all present and future rights to this
 15 | software under copyright law.
 16 | 
 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 20 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 21 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 23 | OTHER DEALINGS IN THE SOFTWARE.
 24 | 
 25 | For more information, please refer to <http://unlicense.org>
 26 | */
 27 | 
 28 | #include <cstdlib>
 29 | #include <cstdio>
 30 | #include <cmath>
 31 | 
 32 | #ifdef GBF_OMP_STATS
 33 | # include "omp.h"
 34 | # define GET_TIME() omp_get_wtime()
 35 | #else
 36 | # define GET_TIME() 0.0
 37 | #endif
 38 | 
 39 | #if __SSE4_1__
 40 | # include "smmintrin.h"
 41 | #endif
 42 | 
 43 | 
 44 | // =============================================================================
 45 | 
 46 | /**
 47 |  * @name BMPFile
 48 |  * @brief Simple BMP file reader/writer
 49 |  * @note Only accept 24b uncompressed image
 50 |  */
 51 | class BMPFile {
 52 | public:
 53 |   // ---------------------------------------------------------------------------
 54 |   /// @name Public methods
 55 |   // ---------------------------------------------------------------------------
 56 | 
 57 |   BMPFile() :
 58 |     data_(NULL)
 59 |   {}
 60 | 
 61 |   ~BMPFile() {
 62 |     clear_data();
 63 |   }
 64 | 
 65 |   /// Clear pixels data
 66 |   void clear_data() {
 67 |     if (data_) {
 68 |       delete [] data_;
 69 |       data_ = NULL;
 70 |     }
 71 |   }
 72 | 
 73 |   /// Load a BMP from a file
 74 |   /// @param filename : bmp file to load
 75 |   /// @return true if the file was loaded successfully
 76 |   bool load(const char* filename);
 77 | 
 78 |   /// Save a BMP to a file
 79 |   /// @param filename : bmp file to save
 80 |   /// @return true if the file was saved successfully
 81 |   bool save(const char* filename) const;
 82 | 
 83 | 
 84 |   // ---------------------------------------------------------------------------
 85 |   /// @name Getters
 86 |   // ---------------------------------------------------------------------------
 87 | 
 88 |   unsigned int width() const {
 89 |     return bih_.biWidth;
 90 |   }
 91 | 
 92 |   unsigned int height() const {
 93 |     return bih_.biHeight;
 94 |   }
 95 | 
 96 |   unsigned int resolution() const {
 97 |     return width() * height();
 98 |   }
 99 | 
100 |   unsigned char* data() {
101 |     return data_;
102 |   }
103 | 
104 | private:
105 |   // ---------------------------------------------------------------------------
106 |   /// @name Format headers
107 |   // ---------------------------------------------------------------------------
108 | 
109 |   // Tighly pack the structure to avoid unwanted padding
110 | #pragma pack(push, 1)
111 |   struct BITMAPFILEHEADER_t { 
112 |     unsigned short  bfType;             //< filetype, must be "BM" (== 19778)
113 |     unsigned int    bfSize;             //< filesize in bytes
114 |     unsigned short  bfReserved1;        //< reserved, must be 0
115 |     unsigned short  bfReserved2;        //< reserved, must be 0
116 |     unsigned int    bfOffBits;          //< offset in bytes, represent the header size
117 |   };
118 | #pragma pack(pop)
119 | 
120 |   struct BITMAPINFOHEADER_t {
121 |     unsigned int    biSize;             //< structure size, must be 40
122 |     int             biWidth;            //< image width
123 |     int             biHeight;           //< image height. If negative, start TopLeft, BottomLeft otherwise
124 |     unsigned short  biPlanes;           //< planes count, must be 1
125 |     unsigned short  biBitCount;         //< bits per pixel
126 |     unsigned int    biCompression;      //< compression method when height > 0
127 |     unsigned int    biSizeImage;        //< Image size in bytes.
128 |     int             biXPelsPerMeter;    //< horizontal resolution in pixel per meter
129 |     int             biYPelsPerMeter;    //< vertical resolution in pixel per meter
130 |     unsigned int    biClrUsed;          //< number of color used from the color palette
131 |     unsigned int    biClrImportant;     //< number of important index color. All if set to 0
132 |   };
133 | 
134 |   // ---------------------------------------------------------------------------
135 |   /// @name Attributes
136 |   // ---------------------------------------------------------------------------
137 | 
138 |   BITMAPFILEHEADER_t bfh_;              //< bitmap file header
139 |   BITMAPINFOHEADER_t bih_;              //< bitmap info header
140 |   unsigned char *data_;                 //< pixels data
141 | };
142 | 
143 | // -----------------------------------------------------------------------------
144 | 
145 | bool BMPFile::load(const char* filename) {
146 |   clear_data();
147 | 
148 |   // Open the bmp file
149 |   FILE *fd = NULL;
150 |   if (NULL == (fd = fopen(filename, "rb"))) {
151 |     fprintf(stderr, "Error : Unable to open \"%s\".\n", filename);
152 |     return false;
153 |   }
154 | 
155 |   // Read the file headers
156 |   const size_t bfh_res = fread(&bfh_, sizeof(bfh_), 1u, fd);
157 |   const size_t bih_res = fread(&bih_, sizeof(bih_), 1u, fd);
158 |   if (   1u != bfh_res
159 |       || 1u != bih_res) {
160 |     fprintf(stderr, "Error : the file header was not read correctly.\n");
161 |     fclose(fd);
162 |     return false;
163 |   }
164 | 
165 |   // Check file format
166 | #define GBFILTER_BMP_MAGICNUMBER  19778
167 |   if (bfh_.bfType != GBFILTER_BMP_MAGICNUMBER) {
168 |     fprintf(stderr, "Error : invalid file format.\n");
169 |     fclose(fd);
170 |     return false;
171 |   }
172 | #undef GBFILTER_BMP_MAGICNUMBER
173 | 
174 |   // Only 24bit uncompressed images are accepted
175 |   if (bih_.biCompression != 0u) {
176 |     fprintf(stderr, "Compressed BMP files are not handled yet.\n");
177 |     fclose(fd);
178 |     return false;
179 |   }
180 |   if (bih_.biBitCount != 24u) {
181 |     fprintf(stderr, "Non 24bits BMP files are not handled yet.\n");
182 |     fclose(fd);
183 |     return false;
184 |   }
185 | 
186 |   // Create internal buffer
187 |   const unsigned int image_size = bih_.biSizeImage;
188 |   data_ = new unsigned char[image_size];
189 | 
190 |   // Be sure to start at pixels data
191 |   fseek(fd, bfh_.bfOffBits, SEEK_SET);
192 | 
193 |   // Load 24 bit uncompressed data [note: BGR order]  
194 |   if (image_size != fread(data_, 1u, image_size, fd)) {
195 |     fprintf(stderr, "Error : the file was not read correctly.\n");
196 |     fclose(fd);
197 |     return false;
198 |   }
199 | 
200 |   fclose(fd);
201 |   return true;
202 | }
203 | 
204 | // -----------------------------------------------------------------------------
205 | 
206 | bool BMPFile::save(const char* filename) const {
207 |   FILE *fd = NULL;
208 | 
209 |   if (NULL == (fd = fopen(filename, "wb"))) {
210 |     return false;
211 |   }
212 | 
213 |   // headers
214 |   fwrite(&bfh_, sizeof(bfh_), 1u, fd);
215 |   fwrite(&bih_, sizeof(bih_), 1u, fd);
216 |   
217 |   // datas
218 |   fseek(fd, bfh_.bfOffBits, SEEK_SET);
219 |   fwrite(data_, 1u, bih_.biSizeImage, fd);
220 |   
221 |   fclose(fd);
222 | 
223 |   return true;
224 | }
225 | 
226 | // =============================================================================
227 | 
#if __SSE4_1__
/**
 * @struct Vec4
 * @brief Four packed floats held in one SSE register
 *
 * The same storage can be accessed either as the raw __m128 value or through
 * the x, y, z, w components.
 */
struct __attribute__((aligned (16))) Vec4 {
  union {
    __m128 mmvalue;
    struct { float x, y, z, w; };
  };

  /// All components set to zero
  Vec4() : mmvalue(_mm_setzero_ps()) {}

  /// All components set to the same value
  Vec4(float value) : mmvalue(_mm_set1_ps(value)) {}

  /// Wrap an existing register value
  Vec4(__m128 mm) : mmvalue(mm) {}

  /// Component-wise construction (_mm_set_ps takes its arguments from the
  /// highest lane to the lowest, hence the reversed order)
  Vec4(float e0, float e1, float e2, float e3)
    : mmvalue(_mm_set_ps(e3, e2, e1, e0)) {}

  /// @return the number of Vec4 needed to store `size` floats
  static
  unsigned int GetAlignedSize(unsigned int size) {
    const unsigned int rounded_up = size + 3u;
    return rounded_up >> 2u;
  }

  /// @return the dot product of the four components of u and v
  static
  float Dot4(const Vec4 &u, const Vec4 &v) {
    // 0xF1 : multiply all four lanes, store the sum in the lowest lane
    const __m128 dot = _mm_dp_ps(u.mmvalue, v.mmvalue, 0xF1);
    return _mm_cvtss_f32(dot);
  }
};
#endif
255 | 
256 | // =============================================================================
257 | 
258 | /**
259 |  * @class GBFilter
260 |  * @brief Apply a Gaussian filter on a bitmap file
261 |  *
262 |  * Taking advantages of the symmetric property of the 2D Gaussian filter, the
263 |  * algorithm is splitted in two passes, one horizontale & one verticale, using
264 |  * a simple 1D filter.
265 |  *
266 |  * @note :
267 |  * The blur radius is set in sub-pixel. The fractional part is used to lerp the
268 |  * filter's extremities. The kernel size is hence set as
269 |  * 2 * ceil(blur_radius) + 1
270 |  * As for discrete gaussian, the kernel size is always an odd number.
271 |  */
272 | class GBFilter {
273 | public:
274 |   GBFilter(float blur_radius);
275 |   ~GBFilter();
276 | 
277 |   /// Apply the gaussian filter on a bmp file
278 |   /// @param bmp : bitmap to filter
279 |   /// @param tile_w : tile width
280 |   /// @param tile_h : tile height
281 |   void apply(BMPFile &bmp, unsigned int tile_w, unsigned int tile_h);
282 | 
283 | private:
284 |   // ---------------------------------------------------------------------------
285 |   /// @name Structures
286 |   // ---------------------------------------------------------------------------
287 | 
288 |   /// Buffer storing separated RGB channels in floating point format
289 |   struct RGBBuffer_t {
290 |     RGBBuffer_t() : red(NULL), green(NULL), blue(NULL) {}
291 |     float *red, *green, *blue;
292 |   };
293 | 
294 |   /// Simplify passing constant layout attributes
295 |   struct LayoutParam_t {
296 |     unsigned int image_w;
297 |     unsigned int image_h;
298 |     unsigned int tile_w;
299 |     unsigned int tile_h;
300 |     unsigned int grid_w;
301 |     unsigned int grid_h;
302 |   };
303 | 
304 |   // ---------------------------------------------------------------------------
305 |   /// @name Methods
306 |   // ---------------------------------------------------------------------------
307 |   
308 |   /// Transpose a buffer into another [row to column]
309 |   static
310 |   void TransposeBuffer(const RGBBuffer_t &in,
311 |                              RGBBuffer_t &out,
312 |                              LayoutParam_t &layout);
313 | 
314 |   /// Initialize the 1D gaussian filter
315 |   void init_filter1D();
316 | 
317 |   /// Run the blur passes (horizontal & vertical)
318 |   /// @param layout : layout parameters
319 |   void blur(LayoutParam_t &layout);
320 | 
321 |   /// Apply a horizontal blur on each tiles
322 |   /// @param layout : layout parameters
323 |   void blur_x(const LayoutParam_t &layout);
324 | 
325 |   /// Apply a vertical blur on each tiles
326 |   /// @param layout : layout parameters
327 |   void blur_y(const LayoutParam_t &layout);
328 | 
329 |   /// Generic blur filter pass
330 |   /// @param layout : layout parameters
331 |   /// @param tx : tile x-coordinate
332 |   /// @param ty : tile y-coordinate
333 |   /// @param blurX : Performs horizontal blur if true, vertical otherwise
334 |   /// @param in : input buffer
335 |   /// @param out : output buffer
336 |   void blur_pass(const LayoutParam_t &layout,
337 |                  const unsigned int tx,
338 |                  const unsigned int ty,
339 |                  const bool blurX,
340 |                  const RGBBuffer_t &in,
341 |                        RGBBuffer_t &out);
342 | 
343 |   // ---------------------------------------------------------------------------
344 |   /// @name Attributes
345 |   // ---------------------------------------------------------------------------
346 |   // Kernel radius threshold after which the transpose buffer layout optimization is used
347 |   static const float kTransposeRadiusThreshold;
348 | 
349 |   // number of RGB buffer used
350 |   static const unsigned int kNumRGBBuffer = 2u;
351 | 
352 |   float *filter1D_;                     //< 1D Gaussian filter
353 |   float blur_radius_;                   //< radius of the blur in sub-pixels
354 |   unsigned int kernel_size_;            //< size of the filter, must be odd
355 |   RGBBuffer_t buffer_[kNumRGBBuffer];   //< F32 RGB channels buffers
356 | 
357 | #if __SSE4_1__
358 |   Vec4 *sse_filter_;                    //< 1D Gaussian filter using SSE4.1
359 | #endif
360 | };
361 | 
362 | const float GBFilter::kTransposeRadiusThreshold = 28.0f;
363 | 
364 | // -----------------------------------------------------------------------------
365 | 
366 | GBFilter::GBFilter(float blur_radius) :
367 |   filter1D_(NULL),
368 |   blur_radius_(blur_radius) 
369 | {
370 |   kernel_size_ = 2u * ceilf(blur_radius_) + 1u;
371 |   init_filter1D();
372 | }
373 | 
374 | // -----------------------------------------------------------------------------
375 | 
376 | GBFilter::~GBFilter() {
377 |   if (filter1D_) {
378 |     delete [] filter1D_;
379 | #if __SSE4_1__
380 |     delete [] sse_filter_;
381 | #endif
382 |   }
383 | 
384 |   for (unsigned int i=0u; i<kNumRGBBuffer; ++i) {
385 |     delete [] buffer_[i].red;
386 |     delete [] buffer_[i].green;
387 |     delete [] buffer_[i].blue;
388 |   } 
389 | }
390 | 
391 | // -----------------------------------------------------------------------------
392 | 
393 | void GBFilter::apply(BMPFile &bmp, unsigned int tile_w, unsigned int tile_h) {
394 |   const unsigned int kResolution = bmp.resolution();
395 | 
396 |   // Initialize RGB float buffers
397 |   for (unsigned int i=0u; i<kNumRGBBuffer; ++i) {
398 |     buffer_[i].red   = new float[kResolution];
399 |     buffer_[i].green = new float[kResolution];
400 |     buffer_[i].blue  = new float[kResolution];
401 |   }
402 | 
403 |   unsigned char *pixels = bmp.data();
404 | 
405 |   // setup first blur buffer [uchar to float]
406 |   const float scale = 1.0f / 255.0f;
407 |   for (unsigned int i=0u; i<kResolution; ++i) {
408 |     buffer_[0u].blue[i]  = scale * pixels[3u*i + 0u];
409 |     buffer_[0u].green[i] = scale * pixels[3u*i + 1u];
410 |     buffer_[0u].red[i]   = scale * pixels[3u*i + 2u];
411 | 
412 |     // [debug color]
413 |     buffer_[1u].blue[i]  = 1.0f;
414 |     buffer_[1u].green[i] = 0.0f;
415 |     buffer_[1u].red[i]   = 1.0f;
416 |   }
417 | 
418 |   // Constants layout parameters
419 |   LayoutParam_t layout;
420 |   layout.image_w = bmp.width();
421 |   layout.image_h = bmp.height();
422 |   layout.tile_w  = tile_w;
423 |   layout.tile_h  = tile_h;
424 |   layout.grid_w  = (layout.image_w + layout.tile_w - 1u) / layout.tile_w;
425 |   layout.grid_h  = (layout.image_h + layout.tile_h - 1u) / layout.tile_h;
426 | 
427 |   // Apply blur
428 |   blur(layout);
429 | 
430 |   // Update BMP buffer [float to uchar]
431 |   for (unsigned int i=0u; i<kResolution; ++i) {
432 |     pixels[3u*i + 0u] = (unsigned char)(255 * buffer_[0u].blue[i]);
433 |     pixels[3u*i + 1u] = (unsigned char)(255 * buffer_[0u].green[i]);
434 |     pixels[3u*i + 2u] = (unsigned char)(255 * buffer_[0u].red[i]);
435 |   }
436 | }
437 | 
438 | // -----------------------------------------------------------------------------
439 | 
440 | void GBFilter::init_filter1D() {
441 |   filter1D_ = new float[kernel_size_];
442 | 
443 |   // Base parameters
444 |   const int c = static_cast<int>(kernel_size_ / 2u);
445 |   const float sigma = kernel_size_ / 3.0f; // heuristic
446 |   const float s = 2.0f * sigma * sigma;
447 |   const float inv_s = 1.0f / s;
448 |   const float inv_s_pi = 1.0f / (3.14159265359f * s);
449 | 
450 |   // Calculate the Gaussian coefficients
451 |   float sum = 0.0f;
452 |   for (int x=-c; x<=c; ++x) {
453 |     const float r = x*x;
454 |     const float coeff = exp(-r * inv_s) * inv_s_pi;
455 |     filter1D_[x+c] = coeff;
456 |     sum += coeff;
457 |   }
458 | 
459 |   // Normalize the filter
460 |   const float inv_sum = 1.0f / sum;
461 |   for (unsigned int i=0u; i<kernel_size_; ++i) {
462 |     filter1D_[i] *= inv_sum;
463 |   }
464 | 
465 |   // Lerp kernel boundaries with radius fractional part
466 |   //filter1D_[0] = filter1D_[kernel_size_-1] *= (kernel_radius_ - int(kernel_radius_));
467 | 
468 | #if __SSE4_1__
469 |   /// Create the Vec4 gaussian filter
470 |   const unsigned int nvec = Vec4::GetAlignedSize(kernel_size_);
471 |   sse_filter_ = new Vec4[nvec];
472 | 
473 |   unsigned int i=0u, j=0u;  
474 |   for (; i+1u < nvec; ++i, j+=4u) {
475 |     sse_filter_[i] = Vec4(filter1D_[j], filter1D_[j+1u], filter1D_[j+2u], filter1D_[j+3u]);
476 |   }
477 | 
478 |   sse_filter_[i].x = filter1D_[j];
479 |   sse_filter_[i].y = (j+1u < kernel_size_) ? filter1D_[j+1u] : 0.0f;
480 |   sse_filter_[i].z = (j+2u < kernel_size_) ? filter1D_[j+2u] : 0.0f;
481 |   sse_filter_[i].w = (j+3u < kernel_size_) ? filter1D_[j+3u] : 0.0f;
482 | #endif
483 | }
484 | 
485 | // -----------------------------------------------------------------------------
486 | 
487 | void GBFilter::TransposeBuffer(const RGBBuffer_t &in,
488 |                                      RGBBuffer_t &out,
489 |                                      LayoutParam_t &layout) {
490 | # pragma omp parallel for collapse(2) num_threads(4)
491 |   for (unsigned int x = 0u; x < layout.image_w; ++x) {
492 |     for (unsigned int y = 0u; y < layout.image_h; ++y) {    
493 |       unsigned int  in_idx = y * layout.image_w + x;
494 |       unsigned int out_idx = x * layout.image_h + y;
495 | 
496 |         out.red[out_idx] = in.red[in_idx];
497 |       out.green[out_idx] = in.green[in_idx];
498 |        out.blue[out_idx] = in.blue[in_idx];
499 |     }
500 |   }
501 | 
502 |   // Transpose the layout
503 |   LayoutParam_t tlayout;
504 |   tlayout.image_w = layout.image_h;
505 |   tlayout.image_h = layout.image_w;
506 |   tlayout.tile_w  = layout.tile_h;
507 |   tlayout.tile_h  = layout.tile_w;
508 |   tlayout.grid_w  = layout.grid_h;
509 |   tlayout.grid_h  = layout.grid_w;
510 |   layout = tlayout;
511 | }
512 | 
513 | // -----------------------------------------------------------------------------
514 | 
515 | void GBFilter::blur(LayoutParam_t &layout) {
516 |   // @note
517 |   // Vertical blur is not cache efficient.
518 |   // This is especially noticeable for large blur radius.
519 |   // A work around is to use a buffer to temporary transpose rows as
520 |   // columns and applying a horizontal blur to it before transposing
521 |   // columns back as rows.
522 | 
523 |   double t1 = GET_TIME();
524 | 
525 |   // Horizontal blur
526 |   blur_x(layout);
527 | 
528 |   double t2 = GET_TIME();
529 | 
530 |   // Vertical blur
531 |   if (blur_radius_ < kTransposeRadiusThreshold) {
532 |     blur_y(layout);
533 |   } else {
534 |     TransposeBuffer(buffer_[1u], buffer_[0u], layout);
535 |     blur_x(layout);
536 |     TransposeBuffer(buffer_[1u], buffer_[0u], layout);
537 |   }
538 | 
539 |   double t3 = GET_TIME();
540 | 
541 | #ifdef GBF_OMP_STATS
542 |   fprintf(stderr, "x-blur : %.3f ms\ny-blur : %.3f ms\ntotal : %.3f ms\n",
543 |                   t2-t1, t3-t2, t3-t1);
544 | #endif
545 | }
546 | 
547 | // -----------------------------------------------------------------------------
548 | 
549 | void GBFilter::blur_x(const LayoutParam_t &layout) {
550 | # pragma omp parallel for collapse(2) schedule(dynamic, 1)
551 |   for (unsigned int ty = 0u; ty < layout.grid_h; ++ty) {
552 |     for (unsigned int tx = 0u; tx < layout.grid_w; ++tx) {
553 |       blur_pass(layout, tx, ty, true, buffer_[0u], buffer_[1u]);
554 |     }
555 |   }  
556 | }
557 | 
558 | // -----------------------------------------------------------------------------
559 | 
560 | void GBFilter::blur_y(const LayoutParam_t &layout) {
561 | # pragma omp parallel for collapse(2) schedule(dynamic, 1)
562 |   for (unsigned int ty = 0u; ty < layout.grid_h; ++ty) {
563 |     for (unsigned int tx = 0u; tx < layout.grid_w; ++tx) {
564 |       blur_pass(layout, tx, ty, false, buffer_[1u], buffer_[0u]);
565 |     }
566 |   }
567 | }
568 | 
569 | // -----------------------------------------------------------------------------
570 | 
namespace {

/// @return the minimum value between two
inline
unsigned int Min(const unsigned int a, const unsigned int b) {
  return (a < b) ? a : b;
}

/// Mirror an index back into the range [0, width-1].
///
/// Indices are reflected around the first and the last element without
/// repeating them (for width == 5 : ... 2 1 | 0 1 2 3 4 | 3 2 1 0 1 ...).
/// The reflection is repeated as often as needed, so the result is valid
/// whatever the distance of x from the range.
///
/// @param x : index to wrap
/// @param width : number of elements of the range
/// @return wrapped index (always 0 when width <= 1)
inline
unsigned int WrappedIndex(const int x, const int width) {
  if (width <= 1) {
    return 0u;
  }
  // Fast path : most indices are already inside the range
  if (x >= 0 && x < width) {
    return static_cast<unsigned int>(x);
  }

  // The mirrored sequence is periodic with a period of 2 * (width - 1)
  const int period = 2 * (width - 1);
  int m = x % period;
  if (m < 0) {
    m += period;
  }
  const int index = (m < width) ? m : period - m;
  return static_cast<unsigned int>(index);
}

/// Get wrapped image index depending on blur offset & directions
/// @param x : pixel x-coordinate
/// @param y : pixel y-coordinate
/// @param w : image width
/// @param h : image height
/// @param dx : offset applied along the blur direction
/// @param blurX : offset the x-coordinate if true, the y-coordinate otherwise
/// @return linear index (row-major) of the offset pixel
inline
unsigned int GetBlurIndex(unsigned int x, unsigned int y, unsigned int w,
                          unsigned int h, int dx, bool blurX) {
  return (blurX) ? y*w + WrappedIndex(int(x) + dx, w)
                 : WrappedIndex(int(y) + dx, h) * w + x;
}

} // namespace
599 | 
600 | // -----------------------------------------------------------------------------
601 | 
602 | void GBFilter::blur_pass(const LayoutParam_t &layout,
603 |                          const unsigned int tx,
604 |                          const unsigned int ty,
605 |                          const bool blurX,
606 |                          const RGBBuffer_t &in,
607 |                                RGBBuffer_t &out) {
608 |   // Discretized kernel radius
609 |   const int c = static_cast<int>(kernel_size_ / 2u);
610 | 
611 |   // Tile start & end index
612 |   const unsigned int start_x = tx * layout.tile_w;
613 |   const unsigned int start_y = ty * layout.tile_h;
614 |   const unsigned int w = layout.image_w;
615 |   const unsigned int h = layout.image_h;
616 |   const unsigned int end_x = Min(start_x + layout.tile_w, w);
617 |   const unsigned int end_y = Min(start_y + layout.tile_h, h);
618 | 
619 |   for (unsigned int y = start_y; y < end_y; ++y) {
620 |     for (unsigned int x = start_x; x < end_x; ++x) {
621 |       unsigned int index = y*layout.image_w + x;
622 | 
623 |       // Pixel blur evaluation
624 |       float rgb[3u] = {0.0f};
625 | #if __SSE4_1__
626 |       for (int dx=-c, cid=0; dx<=c; dx+=4, ++cid) {
627 |         // Gaussian filter coefficients
628 |         const Vec4 &GFC = sse_filter_[cid];
629 | 
630 |         unsigned int i1 = GetBlurIndex(x, y, w, h, dx+0, blurX);
631 |         unsigned int i2 = GetBlurIndex(x, y, w, h, dx+1, blurX);
632 |         unsigned int i3 = GetBlurIndex(x, y, w, h, dx+2, blurX);
633 |         unsigned int i4 = GetBlurIndex(x, y, w, h, dx+3, blurX);
634 |         Vec4 RED(in.red[i1], in.red[i2], in.red[i3], in.red[i4]);
635 |         Vec4 GREEN(in.green[i1], in.green[i2], in.green[i3], in.green[i4]);
636 |         Vec4 BLUE(in.blue[i1], in.blue[i2], in.blue[i3], in.blue[i4]);
637 | 
638 |         rgb[0u] += Vec4::Dot4(GFC, RED);
639 |         rgb[1u] += Vec4::Dot4(GFC, GREEN);
640 |         rgb[2u] += Vec4::Dot4(GFC, BLUE);
641 |       }
642 | #else
643 |       for (int dx=-c; dx<=c; ++dx) {
644 |         float gfc = filter1D_[dx+c];
645 |         unsigned int w_index = GetBlurIndex(x, y, w, h, dx, blurX);
646 | 
647 |         rgb[0u] += gfc * in.red[w_index];
648 |         rgb[1u] += gfc * in.green[w_index];
649 |         rgb[2u] += gfc * in.blue[w_index];
650 |       }
651 | #endif  // __SSE4_1__
652 | 
653 |       // Final pixel, should not need to be clamped
654 |       out.red[index]   = rgb[0u];
655 |       out.green[index] = rgb[1u];
656 |       out.blue[index]  = rgb[2u];
657 |     }
658 |   }
659 | }
660 | 
661 | // =============================================================================
662 | 
663 | int main(int argc, char **argv) {
664 |   // Retrieve command line arguments
665 |   if (argc < 6) {
666 |     fprintf(stderr, "usage :\n%s input_file output_file blur_radius tile_width " \
667 |                     "tile_height\n", argv[0u]);
668 |     exit(EXIT_FAILURE);
669 |   }
670 | 
671 |   char *p_filename_in = argv[1u];
672 |   char *p_filename_out = argv[2u];
673 | 
674 |   float blur_radius(0.0f);
675 |   sscanf(argv[3u], "%f", &blur_radius);
676 | 
677 |   unsigned int tile_w(0u), tile_h(0u);
678 |   sscanf(argv[4u], "%u", &tile_w);
679 |   sscanf(argv[5u], "%u", &tile_h);
680 |  
681 |   // ---------------------------
682 | 
683 |   BMPFile bmp;
684 | 
685 |   // Load the image
686 |   if (!bmp.load(p_filename_in)) {
687 |     exit(EXIT_FAILURE);
688 |   }
689 |   
690 |   // Apply a Gaussian filter to the input image
691 |   GBFilter(blur_radius).apply(bmp, tile_w, tile_h);
692 | 
693 |   // Save the result
694 |   bmp.save(p_filename_out);
695 | 
696 |   return EXIT_SUCCESS;
697 | }
698 | 
699 | // =============================================================================
700 | 


--------------------------------------------------------------------------------