├── CMakeLists.txt ├── README.md ├── UNLICENSE └── gbfilter.cc /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | project(gbfilter) 4 | 5 | if(CMAKE_COMPILER_IS_GNUCXX) 6 | set(SSE41_FLAGS "-msse4.1 -march=core2") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fopenmp ${SSE41_FLAGS}") 8 | add_definitions("-DGBF_OMP_STATS") 9 | elseif(MSVC) 10 | # MSVC's OpenMP implementation is crappy as hell 11 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox") 12 | add_definitions("/DGBF_OMP_STATS") 13 | else() 14 | if(${CMAKE_CXX_COMPILER}) 15 | message(WARNING "Unsupported compiler") 16 | endif() 17 | endif() 18 | 19 | add_executable(${CMAKE_PROJECT_NAME} gbfilter.cc) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Optimized CPU Gaussian blur filter 2 | ================================== 3 | 4 | Features : 5 | * Cache efficient data access 6 | * Multithreading using OpenMP 7 | * Vectorization using SSE 4.1 intrinsics 8 | * Cross-platform (Linux, Windows, OS X) 9 | * 24bpp uncompressed BMP reader/writer 10 | 11 | Build : 12 | ```shell 13 | mkdir build; cd build/ 14 | cmake .. 15 | cmake --build . --config Release 16 | ``` 17 | 18 | Usage : 19 | ```shell 20 | ./gbfilter input.bmp output.bmp blur_radius tile_width tile_height 21 | ``` 22 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 
7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /gbfilter.cc: -------------------------------------------------------------------------------- 1 | /* 2 | This is free and unencumbered software released into the public domain. 3 | 4 | Anyone is free to copy, modify, publish, use, compile, sell, or 5 | distribute this software, either in source code form or as a compiled 6 | binary, for any purpose, commercial or non-commercial, and by any 7 | means. 8 | 9 | In jurisdictions that recognize copyright laws, the author or authors 10 | of this software dedicate any and all copyright interest in the 11 | software to the public domain. We make this dedication for the benefit 12 | of the public at large and to the detriment of our heirs and 13 | successors. We intend this dedication to be an overt act of 14 | relinquishment in perpetuity of all present and future rights to this 15 | software under copyright law. 
// =============================================================================

/**
 * @name BMPFile
 * @brief Simple BMP file reader/writer
 * @note Only accepts 24-bit uncompressed images.
 *
 * The object owns the pixel buffer returned by data(). It is therefore
 * non-copyable: a member-wise copy would make two objects delete the same
 * buffer.
 */
class BMPFile {
 public:
  // ---------------------------------------------------------------------------
  /// @name Public methods
  // ---------------------------------------------------------------------------

  /// Create an empty image: zeroed headers, no pixel buffer.
  BMPFile() :
    bfh_(),
    bih_(),
    data_(NULL)
  {}

  ~BMPFile() {
    clear_data();
  }

  /// Release the pixel buffer (safe to call when nothing is loaded)
  void clear_data() {
    delete [] data_;  // deleting a null pointer is a no-op
    data_ = NULL;
  }

  /// Load a BMP from a file
  /// @param filename : bmp file to load
  /// @return true if the file was loaded successfully. On failure an error is
  ///         printed on stderr and no pixel buffer is kept.
  bool load(const char* filename);

  /// Save a BMP to a file
  /// @param filename : bmp file to save
  /// @return true if the headers and every pixel byte were written
  bool save(const char* filename) const;


  // ---------------------------------------------------------------------------
  /// @name Getters
  // ---------------------------------------------------------------------------

  /// Image width in pixels
  unsigned int width() const {
    return static_cast<unsigned int>(bih_.biWidth);
  }

  /// Image height in pixels. The header stores a negative height for
  /// top-down bitmaps, so the magnitude is returned.
  unsigned int height() const {
    const unsigned int raw = static_cast<unsigned int>(bih_.biHeight);
    return (bih_.biHeight < 0) ? (0u - raw) : raw;
  }

  /// Number of pixels in the image
  unsigned int resolution() const {
    return width() * height();
  }

  /// Raw pixel bytes as stored in the file, NULL when nothing is loaded
  unsigned char* data() {
    return data_;
  }

 private:
  // Non-copyable (declared, intentionally never defined)
  BMPFile(const BMPFile&);
  BMPFile& operator=(const BMPFile&);

  /// Parse the headers and pixel data from an already opened file.
  /// @return true on success; the caller closes the file and cleans up.
  bool read_from(FILE *fd);

  // ---------------------------------------------------------------------------
  /// @name Format headers
  // ---------------------------------------------------------------------------

  // Tightly pack the structure to avoid unwanted padding: the 2-byte bfType
  // would otherwise be followed by 2 bytes of padding and no longer match the
  // 14-byte on-disk layout.
#pragma pack(push, 1)
  struct BITMAPFILEHEADER_t {
    unsigned short  bfType;           //< filetype, must be "BM" (== 19778)
    unsigned int    bfSize;           //< filesize in bytes
    unsigned short  bfReserved1;      //< reserved, must be 0
    unsigned short  bfReserved2;      //< reserved, must be 0
    unsigned int    bfOffBits;        //< offset in bytes of the pixel data
  };
#pragma pack(pop)

  struct BITMAPINFOHEADER_t {
    unsigned int    biSize;           //< structure size, must be 40
    int             biWidth;          //< image width
    int             biHeight;         //< image height. If negative, start TopLeft, BottomLeft otherwise
    unsigned short  biPlanes;         //< planes count, must be 1
    unsigned short  biBitCount;       //< bits per pixel
    unsigned int    biCompression;    //< compression method, 0 means uncompressed
    unsigned int    biSizeImage;      //< Image size in bytes, may be 0 when uncompressed
    int             biXPelsPerMeter;  //< horizontal resolution in pixel per meter
    int             biYPelsPerMeter;  //< vertical resolution in pixel per meter
    unsigned int    biClrUsed;        //< number of color used from the color palette
    unsigned int    biClrImportant;   //< number of important index color. All if set to 0
  };

  // ---------------------------------------------------------------------------
  /// @name Attributes
  // ---------------------------------------------------------------------------

  BITMAPFILEHEADER_t bfh_;            //< bitmap file header
  BITMAPINFOHEADER_t bih_;            //< bitmap info header
  unsigned char *data_;               //< pixels data
};

// -----------------------------------------------------------------------------

bool BMPFile::load(const char* filename) {
  clear_data();

  // Open the bmp file
  FILE *fd = fopen(filename, "rb");
  if (NULL == fd) {
    fprintf(stderr, "Error : Unable to open \"%s\".\n", filename);
    return false;
  }

  const bool loaded = read_from(fd);
  fclose(fd);

  // Never expose a partially filled buffer after a failed load
  if (!loaded) {
    clear_data();
  }
  return loaded;
}

// -----------------------------------------------------------------------------

bool BMPFile::read_from(FILE *fd) {
  // Read the file headers
  if (   1u != fread(&bfh_, sizeof(bfh_), 1u, fd)
      || 1u != fread(&bih_, sizeof(bih_), 1u, fd)) {
    fprintf(stderr, "Error : the file header was not read correctly.\n");
    return false;
  }

  // Check file format ("BM" read as a little-endian 16-bit value)
  const unsigned short kBmpMagicNumber = 19778u;
  if (bfh_.bfType != kBmpMagicNumber) {
    fprintf(stderr, "Error : invalid file format.\n");
    return false;
  }

  // Only 24bit uncompressed images are accepted
  if (bih_.biCompression != 0u) {
    fprintf(stderr, "Compressed BMP files are not handled yet.\n");
    return false;
  }
  if (bih_.biBitCount != 24u) {
    fprintf(stderr, "Non 24bits BMP files are not handled yet.\n");
    return false;
  }

  // A negative height is valid (top-down rows); width must be positive
  if (bih_.biWidth <= 0 || bih_.biHeight == 0) {
    fprintf(stderr, "Error : invalid image dimensions.\n");
    return false;
  }

  // Uncompressed bitmaps may legally store biSizeImage == 0. Derive the size
  // from the dimensions then (rows are padded to a multiple of 4 bytes), and
  // record it so save() writes the pixels back.
  if (0u == bih_.biSizeImage) {
    const unsigned int kMaxBytes = 0xFFFFFFFFu;
    const unsigned int w = width();
    const unsigned int h = height();
    if (w > (kMaxBytes - 3u) / 3u) {
      fprintf(stderr, "Error : invalid image dimensions.\n");
      return false;
    }
    const unsigned int row_bytes = (w * 3u + 3u) & ~3u;
    if (row_bytes > kMaxBytes / h) {
      fprintf(stderr, "Error : invalid image dimensions.\n");
      return false;
    }
    bih_.biSizeImage = row_bytes * h;
  }

  // Create internal buffer
  const unsigned int image_size = bih_.biSizeImage;
  data_ = new unsigned char[image_size];

  // Be sure to start at pixels data, then
  // load 24 bit uncompressed data [note: BGR order]
  if (   0 != fseek(fd, static_cast<long>(bfh_.bfOffBits), SEEK_SET)
      || image_size != fread(data_, 1u, image_size, fd)) {
    fprintf(stderr, "Error : the file was not read correctly.\n");
    return false;
  }

  return true;
}

// -----------------------------------------------------------------------------

bool BMPFile::save(const char* filename) const {
  // Nothing was loaded: there is neither a valid header nor a buffer to write
  if (NULL == data_) {
    fprintf(stderr, "Error : no image data to save.\n");
    return false;
  }

  FILE *fd = fopen(filename, "wb");
  if (NULL == fd) {
    fprintf(stderr, "Error : Unable to open \"%s\".\n", filename);
    return false;
  }

  // headers, then pixel data at the offset announced by the file header
  bool written =
       1u == fwrite(&bfh_, sizeof(bfh_), 1u, fd)
    && 1u == fwrite(&bih_, sizeof(bih_), 1u, fd)
    && 0  == fseek(fd, static_cast<long>(bfh_.bfOffBits), SEEK_SET)
    && bih_.biSizeImage == fwrite(data_, 1u, bih_.biSizeImage, fd);

  // Buffered data is flushed by fclose, so its result matters too
  if (0 != fclose(fd)) {
    written = false;
  }

  if (!written) {
    fprintf(stderr, "Error : the file was not written correctly.\n");
  }
  return written;
}

// =============================================================================

#if __SSE4_1__
/**
 * @struct Vec4
 * @brief Simple Vector4 structure using SSE4.1 intrinsics
 *
 * Four packed floats, 16-byte aligned, readable either as x/y/z/w or as one
 * __m128 register value.
 */
struct __attribute__((aligned (16))) Vec4 {
  Vec4() : mmvalue(_mm_setzero_ps()) {}
  // _mm_set_ps takes its arguments from the highest lane down, hence w..x
  Vec4(float x, float y, float z, float w) : mmvalue(_mm_set_ps(w, z, y, x)) {}
  Vec4(float value) : mmvalue(_mm_set1_ps(value)) {}
  Vec4(__m128 mm) : mmvalue(mm) {}

  /// Number of Vec4 needed to hold `size` floats (rounded up)
  static
  unsigned int GetAlignedSize(unsigned int size) {
    return (size + 3u) / 4u;
  }

  /// Four-component dot product.
  /// Mask 0xF1 : multiply all four lanes, store the sum in the lowest lane.
  static
  float Dot4(const Vec4 &u, const Vec4 &v) {
    return _mm_cvtss_f32(_mm_dp_ps(u.mmvalue, v.mmvalue, 0xF1));
  }

  union {
    struct { float x, y, z, w; };
    __m128 mmvalue;
  };
};
#endif
horizontale & one verticale, using 264 | * a simple 1D filter. 265 | * 266 | * @note : 267 | * The blur radius is set in sub-pixel. The fractional part is used to lerp the 268 | * filter's extremities. The kernel size is hence set as 269 | * 2 * ceil(blur_radius) + 1 270 | * As for discrete gaussian, the kernel size is always an odd number. 271 | */ 272 | class GBFilter { 273 | public: 274 | GBFilter(float blur_radius); 275 | ~GBFilter(); 276 | 277 | /// Apply the gaussian filter on a bmp file 278 | /// @param bmp : bitmap to filter 279 | /// @param tile_w : tile width 280 | /// @param tile_h : tile height 281 | void apply(BMPFile &bmp, unsigned int tile_w, unsigned int tile_h); 282 | 283 | private: 284 | // --------------------------------------------------------------------------- 285 | /// @name Structures 286 | // --------------------------------------------------------------------------- 287 | 288 | /// Buffer storing separated RGB channels in floating point format 289 | struct RGBBuffer_t { 290 | RGBBuffer_t() : red(NULL), green(NULL), blue(NULL) {} 291 | float *red, *green, *blue; 292 | }; 293 | 294 | /// Simplify passing constant layout attributes 295 | struct LayoutParam_t { 296 | unsigned int image_w; 297 | unsigned int image_h; 298 | unsigned int tile_w; 299 | unsigned int tile_h; 300 | unsigned int grid_w; 301 | unsigned int grid_h; 302 | }; 303 | 304 | // --------------------------------------------------------------------------- 305 | /// @name Methods 306 | // --------------------------------------------------------------------------- 307 | 308 | /// Transpose a buffer into another [row to column] 309 | static 310 | void TransposeBuffer(const RGBBuffer_t &in, 311 | RGBBuffer_t &out, 312 | LayoutParam_t &layout); 313 | 314 | /// Initialize the 1D gaussian filter 315 | void init_filter1D(); 316 | 317 | /// Run the blur passes (horizontal & vertical) 318 | /// @param layout : layout parameters 319 | void blur(LayoutParam_t &layout); 320 | 321 | /// 
Apply a horizontal blur on each tiles 322 | /// @param layout : layout parameters 323 | void blur_x(const LayoutParam_t &layout); 324 | 325 | /// Apply a vertical blur on each tiles 326 | /// @param layout : layout parameters 327 | void blur_y(const LayoutParam_t &layout); 328 | 329 | /// Generic blur filter pass 330 | /// @param layout : layout parameters 331 | /// @param tx : tile x-coordinate 332 | /// @param ty : tile y-coordinate 333 | /// @param blurX : Performs horizontal blur if true, vertical otherwise 334 | /// @param in : input buffer 335 | /// @param out : output buffer 336 | void blur_pass(const LayoutParam_t &layout, 337 | const unsigned int tx, 338 | const unsigned int ty, 339 | const bool blurX, 340 | const RGBBuffer_t &in, 341 | RGBBuffer_t &out); 342 | 343 | // --------------------------------------------------------------------------- 344 | /// @name Attributes 345 | // --------------------------------------------------------------------------- 346 | // Kernel radius threshold after which the transpose buffer layout optimization is used 347 | static const float kTransposeRadiusThreshold; 348 | 349 | // number of RGB buffer used 350 | static const unsigned int kNumRGBBuffer = 2u; 351 | 352 | float *filter1D_; //< 1D Gaussian filter 353 | float blur_radius_; //< radius of the blur in sub-pixels 354 | unsigned int kernel_size_; //< size of the filter, must be odd 355 | RGBBuffer_t buffer_[kNumRGBBuffer]; //< F32 RGB channels buffers 356 | 357 | #if __SSE4_1__ 358 | Vec4 *sse_filter_; //< 1D Gaussian filter using SSE4.1 359 | #endif 360 | }; 361 | 362 | const float GBFilter::kTransposeRadiusThreshold = 28.0f; 363 | 364 | // ----------------------------------------------------------------------------- 365 | 366 | GBFilter::GBFilter(float blur_radius) : 367 | filter1D_(NULL), 368 | blur_radius_(blur_radius) 369 | { 370 | kernel_size_ = 2u * ceilf(blur_radius_) + 1u; 371 | init_filter1D(); 372 | } 373 | 374 | // 
----------------------------------------------------------------------------- 375 | 376 | GBFilter::~GBFilter() { 377 | if (filter1D_) { 378 | delete [] filter1D_; 379 | #if __SSE4_1__ 380 | delete [] sse_filter_; 381 | #endif 382 | } 383 | 384 | for (unsigned int i=0u; i(kernel_size_ / 2u); 445 | const float sigma = kernel_size_ / 3.0f; // heuristic 446 | const float s = 2.0f * sigma * sigma; 447 | const float inv_s = 1.0f / s; 448 | const float inv_s_pi = 1.0f / (3.14159265359f * s); 449 | 450 | // Calculate the Gaussian coefficients 451 | float sum = 0.0f; 452 | for (int x=-c; x<=c; ++x) { 453 | const float r = x*x; 454 | const float coeff = exp(-r * inv_s) * inv_s_pi; 455 | filter1D_[x+c] = coeff; 456 | sum += coeff; 457 | } 458 | 459 | // Normalize the filter 460 | const float inv_sum = 1.0f / sum; 461 | for (unsigned int i=0u; i= width) ? width-1 : x; 586 | const int index = (x < 0) ? -x : (x < width) ? x : width-2 + width-x; 587 | return static_cast(index); 588 | } 589 | 590 | /// Get wrapped image index depending on blur offset & directions 591 | inline 592 | unsigned int GetBlurIndex(unsigned int x, unsigned int y, unsigned int w, 593 | unsigned int h, int dx, bool blurX) { 594 | return (blurX) ? 
y*w + WrappedIndex(int(x) + dx, w) 595 | : WrappedIndex(int(y) + dx, h) * w + x; 596 | } 597 | 598 | } // namespace 599 | 600 | // ----------------------------------------------------------------------------- 601 | 602 | void GBFilter::blur_pass(const LayoutParam_t &layout, 603 | const unsigned int tx, 604 | const unsigned int ty, 605 | const bool blurX, 606 | const RGBBuffer_t &in, 607 | RGBBuffer_t &out) { 608 | // Discretized kernel radius 609 | const int c = static_cast(kernel_size_ / 2u); 610 | 611 | // Tile start & end index 612 | const unsigned int start_x = tx * layout.tile_w; 613 | const unsigned int start_y = ty * layout.tile_h; 614 | const unsigned int w = layout.image_w; 615 | const unsigned int h = layout.image_h; 616 | const unsigned int end_x = Min(start_x + layout.tile_w, w); 617 | const unsigned int end_y = Min(start_y + layout.tile_h, h); 618 | 619 | for (unsigned int y = start_y; y < end_y; ++y) { 620 | for (unsigned int x = start_x; x < end_x; ++x) { 621 | unsigned int index = y*layout.image_w + x; 622 | 623 | // Pixel blur evaluation 624 | float rgb[3u] = {0.0f}; 625 | #if __SSE4_1__ 626 | for (int dx=-c, cid=0; dx<=c; dx+=4, ++cid) { 627 | // Gaussian filter coefficients 628 | const Vec4 &GFC = sse_filter_[cid]; 629 | 630 | unsigned int i1 = GetBlurIndex(x, y, w, h, dx+0, blurX); 631 | unsigned int i2 = GetBlurIndex(x, y, w, h, dx+1, blurX); 632 | unsigned int i3 = GetBlurIndex(x, y, w, h, dx+2, blurX); 633 | unsigned int i4 = GetBlurIndex(x, y, w, h, dx+3, blurX); 634 | Vec4 RED(in.red[i1], in.red[i2], in.red[i3], in.red[i4]); 635 | Vec4 GREEN(in.green[i1], in.green[i2], in.green[i3], in.green[i4]); 636 | Vec4 BLUE(in.blue[i1], in.blue[i2], in.blue[i3], in.blue[i4]); 637 | 638 | rgb[0u] += Vec4::Dot4(GFC, RED); 639 | rgb[1u] += Vec4::Dot4(GFC, GREEN); 640 | rgb[2u] += Vec4::Dot4(GFC, BLUE); 641 | } 642 | #else 643 | for (int dx=-c; dx<=c; ++dx) { 644 | float gfc = filter1D_[dx+c]; 645 | unsigned int w_index = GetBlurIndex(x, y, w, h, dx, 
blurX); 646 | 647 | rgb[0u] += gfc * in.red[w_index]; 648 | rgb[1u] += gfc * in.green[w_index]; 649 | rgb[2u] += gfc * in.blue[w_index]; 650 | } 651 | #endif // __SSE4_1__ 652 | 653 | // Final pixel, should not need to be clamped 654 | out.red[index] = rgb[0u]; 655 | out.green[index] = rgb[1u]; 656 | out.blue[index] = rgb[2u]; 657 | } 658 | } 659 | } 660 | 661 | // ============================================================================= 662 | 663 | int main(int argc, char **argv) { 664 | // Retrieve command line arguments 665 | if (argc < 6) { 666 | fprintf(stderr, "usage :\n%s input_file output_file blur_radius tile_width " \ 667 | "tile_height\n", argv[0u]); 668 | exit(EXIT_FAILURE); 669 | } 670 | 671 | char *p_filename_in = argv[1u]; 672 | char *p_filename_out = argv[2u]; 673 | 674 | float blur_radius(0.0f); 675 | sscanf(argv[3u], "%f", &blur_radius); 676 | 677 | unsigned int tile_w(0u), tile_h(0u); 678 | sscanf(argv[4u], "%u", &tile_w); 679 | sscanf(argv[5u], "%u", &tile_h); 680 | 681 | // --------------------------- 682 | 683 | BMPFile bmp; 684 | 685 | // Load the image 686 | if (!bmp.load(p_filename_in)) { 687 | exit(EXIT_FAILURE); 688 | } 689 | 690 | // Apply a Gaussian filter to the input image 691 | GBFilter(blur_radius).apply(bmp, tile_w, tile_h); 692 | 693 | // Save the result 694 | bmp.save(p_filename_out); 695 | 696 | return EXIT_SUCCESS; 697 | } 698 | 699 | // ============================================================================= 700 | --------------------------------------------------------------------------------