├── RBF_run64.png ├── RBF_chart32.png ├── RBF_chart64.png ├── RBF_chart64vs32.png ├── images ├── testGirl.jpg ├── Thefarmhouse.jpg └── testpatern5.png ├── stdafx.cpp ├── targetver.h ├── stdafx.h ├── LICENSE ├── RecursiveBilateralFilter.sln ├── RBFilterPlain.h ├── RBFilter_SSE2.h ├── RBFilter_AVX2.h ├── ReadMe.md ├── RecursiveBilateralFilter.vcxproj ├── RBFilterPlain.cpp ├── rbf.hpp ├── RecursiveBilateralFilter.cpp ├── RBFilter_SSE2.cpp ├── stb_image_write.h └── RBFilter_AVX2.cpp /RBF_run64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_run64.png -------------------------------------------------------------------------------- /RBF_chart32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart32.png -------------------------------------------------------------------------------- /RBF_chart64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart64.png -------------------------------------------------------------------------------- /RBF_chart64vs32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart64vs32.png -------------------------------------------------------------------------------- /images/testGirl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/testGirl.jpg -------------------------------------------------------------------------------- /images/Thefarmhouse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/Thefarmhouse.jpg 
-------------------------------------------------------------------------------- /images/testpatern5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/testpatern5.png -------------------------------------------------------------------------------- /stdafx.cpp: -------------------------------------------------------------------------------- 1 | // stdafx.cpp : source file that includes just the standard includes 2 | // RecursiveBilateralFilter.pch will be the pre-compiled header 3 | // stdafx.obj will contain the pre-compiled type information 4 | 5 | #include "stdafx.h" 6 | 7 | // TODO: reference any additional headers you need in STDAFX.H 8 | // and not in this file 9 | -------------------------------------------------------------------------------- /targetver.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Including SDKDDKVer.h defines the highest available Windows platform. 4 | 5 | // If you wish to build your application for a previous Windows platform, include WinSDKVer.h and 6 | // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. 
7 | 8 | #include 9 | -------------------------------------------------------------------------------- /stdafx.h: -------------------------------------------------------------------------------- 1 | // stdafx.h : include file for standard system include files, 2 | // or project specific include files that are used frequently, but 3 | // are changed infrequently 4 | // 5 | 6 | #pragma once 7 | 8 | #include "targetver.h" 9 | 10 | // these are needed for the image loader STB 11 | #define STB_IMAGE_IMPLEMENTATION 12 | #define STB_IMAGE_WRITE_IMPLEMENTATION 13 | #define _CRT_SECURE_NO_WARNINGS 14 | 15 | #include 16 | #include 17 | 18 | 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ming 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /RecursiveBilateralFilter.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RecursiveBilateralFilter", "RecursiveBilateralFilter.vcxproj", "{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x64.ActiveCfg = Debug|x64 17 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x64.Build.0 = Debug|x64 18 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x86.ActiveCfg = Debug|Win32 19 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x86.Build.0 = Debug|Win32 20 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x64.ActiveCfg = Release|x64 21 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x64.Build.0 = Release|x64 22 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x86.ActiveCfg = Release|Win32 23 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /RBFilterPlain.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | // This class is useful only for the sake of understanding the main principles of Recursive Bilateral Filter 5 | // It is designed in non-optimal but easy to understand way. 
It also does not match 1:1 with original, 6 | // some creative liberties were taken with original idea. 7 | // This class is not used in performance tests 8 | 9 | class CRBFilterPlain 10 | { 11 | int m_reserve_width = 0; 12 | int m_reserve_height = 0; 13 | int m_reserve_channels = 0; 14 | 15 | float* m_left_pass_color = nullptr; 16 | float* m_left_pass_factor = nullptr; 17 | 18 | float* m_right_pass_color = nullptr; 19 | float* m_right_pass_factor = nullptr; 20 | 21 | float* m_down_pass_color = nullptr; 22 | float* m_down_pass_factor = nullptr; 23 | 24 | float* m_up_pass_color = nullptr; 25 | float* m_up_pass_factor = nullptr; 26 | 27 | int getDiffFactor(const unsigned char* color1, const unsigned char* color2) const; 28 | 29 | public: 30 | 31 | CRBFilterPlain(); 32 | ~CRBFilterPlain(); 33 | 34 | // intended for 3/4 channel images, 1 byte per channel (the implementation asserts 1 <= channels <= 4); any previous reservation is released first 35 | void reserveMemory(int max_width, int max_height, int channels); 36 | void releaseMemory(); 37 | 38 | // memory must be reserved before calling image filter 39 | // this implementation of filter uses plain C++, single threaded 40 | // channel count must be 3 or 4 (NOTE(review): originally documented as "alpha not used", but getDiffFactor() in RBFilterPlain.cpp averages all four components when 4 channels are reserved - confirm intent) 41 | void filter(unsigned char* img_src, unsigned char* img_dst, 42 | float sigma_spatial, float sigma_range, 43 | int width, int height, int channel); 44 | }; -------------------------------------------------------------------------------- /RBFilter_SSE2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Optimized SSE2 implementation of Recursive Bilateral Filter 4 | // 5 | 6 | #include 7 | 8 | #define RBF_MAX_THREADS 8 9 | #define STAGE_BUFFER_COUNT 3 10 | 11 | class CRBFilterSSE2 12 | { 13 | int m_reserved_width = 0; 14 | int m_reserved_height = 0; 15 | int m_thread_count = 0; 16 | bool m_pipelined = false; 17 | 18 | float m_sigma_spatial = 0.f; 19 | float m_sigma_range = 0.f; 20 | float m_inv_alpha_f = 0.f; 21 | float* m_range_table = nullptr; 22 | 23 | int m_filter_counter = 0; 
// used in pipelined mode 24 | unsigned char* m_stage_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // size width * height * 4, 2nd one null if not pipelined 25 | float** m_h_line_cache = nullptr; // single line cache for horizontal filter pass, one per thread 26 | float** m_v_line_cache = nullptr; // if not pipelined mode, this is equal to 'm_h_line_cache' 27 | unsigned char* m_out_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // used for keeping track of current output buffer in pipelined mode 28 | int m_image_width = 0; // cache of sizes for pipelined mode 29 | int m_image_height = 0; 30 | int m_image_pitch = 0; 31 | 32 | std::future m_horizontal_tasks[RBF_MAX_THREADS]; 33 | std::future m_vertical_tasks[RBF_MAX_THREADS]; 34 | 35 | // core filter functions 36 | void horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch); 37 | void verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch); 38 | 39 | public: 40 | 41 | CRBFilterSSE2(); 42 | ~CRBFilterSSE2(); 43 | 44 | // 'sigma_spatial' - unlike the original implementation of Recursive Bilateral Filter, 45 | // the value of sigma_spatial is not influenced by image width/height. 
46 | // In this implementation, sigma_spatial is assumed over image width 255, height 255 47 | void setSigma(float sigma_spatial, float sigma_range); 48 | 49 | // Source and destination images are assumed to have 4 components per pixel 50 | // 'width' - maximum image width 51 | // 'height' - maximum image height 52 | // 'thread_count' - total thread count to use for each filter stage (horizontal and vertical), recommended thread count = 4; the per-stage task arrays hold RBF_MAX_THREADS entries 53 | // 'pipelined' - if true, then horizontal and vertical filter passes are split into separate stages, 54 | // where each stage uses 'thread_count' threads (so the total is basically doubled) 55 | // Returns true if successful; only very basic error checking is done 56 | bool initialize(int width, int height, int thread_count = 1, bool pipelined = false); 57 | 58 | // de-initialize, free memory 59 | void release(); 60 | 61 | // synchronous filter function, returns only when everything finished, goes faster if there's multiple threads 62 | // initialize() and setSigma() should be called before this 63 | // 'out_data' - output image buffer, assumes 4 byte per pixel 64 | // 'in_data' - input image buffer, assumes 4 byte per pixel 65 | // 'width' - width of both input and output buffers, must be same for both 66 | // 'height' - height of both input and output buffers, must be same for both 67 | // 'pitch' - row size in bytes, must be same for both buffers (ideally, this should be divisible by 16) 68 | // return false if failed for some reason 69 | bool filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch); 70 | 71 | // asynchronous, pipelined filter function 72 | // pipeline consists of 2 stages, one for horizontal filter, other for vertical filter 73 | // this is useful for video filtering where 1-2 frame delay is acceptable 74 | // for simplicity of this sample implementation, input and output data buffers must remain valid until filtering is finished 75 | // since it's 2 stage pipeline, consecutive calls should submit 
alternating buffers (2 sets of input and output buffers) 76 | // This function blocks until 1st stage finishes from previous call 77 | bool filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch); 78 | // this function blocks until both stages finished all processing 79 | // it should always be used to get last frame 80 | void filterPipeFlush(); 81 | }; -------------------------------------------------------------------------------- /RBFilter_AVX2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Optimized AVX2 implementation of Recursive Bilateral Filter 4 | // 5 | 6 | #include 7 | 8 | #define RBF_MAX_THREADS 8 9 | #define STAGE_BUFFER_COUNT 3 10 | 11 | class CRBFilterAVX2 12 | { 13 | int m_reserved_width = 0; 14 | int m_reserved_height = 0; 15 | int m_thread_count = 0; 16 | bool m_pipelined = false; 17 | 18 | float m_sigma_spatial = 0.f; 19 | float m_sigma_range = 0.f; 20 | float m_inv_alpha_f = 0.f; 21 | float* m_range_table = nullptr; 22 | 23 | int m_filter_counter = 0; // used in pipelined mode 24 | unsigned char* m_stage_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // size width * height * 4, others are null if not pipelined 25 | float** m_h_line_cache = nullptr; // line cache for horizontal filter pass, 1 per thread 26 | float** m_v_line_cache = nullptr; // line cache for vertical filter pass, 1 per thread 27 | unsigned char* m_out_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // used for keeping track of current output buffer in pipelined mode 28 | int m_image_width = 0; // cache of sizes for pipelined mode 29 | int m_image_height = 0; 30 | int m_image_pitch = 0; 31 | 32 | std::future m_horizontal_tasks[RBF_MAX_THREADS]; 33 | std::future m_vertical_tasks[RBF_MAX_THREADS]; 34 | 35 | // core filter functions 36 | void horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch); 37 | void 
verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch); 38 | 39 | public: 40 | 41 | CRBFilterAVX2(); 42 | ~CRBFilterAVX2(); 43 | 44 | // given specified image width, return optimal row size in bytes that has been rounded up to better fit YMM registers 45 | // image buffers should use this pitch for input and output 46 | int getOptimalPitch(int width) const; 47 | 48 | // 'sigma_spatial' - unlike the original implementation of Recursive Bilateral Filter, 49 | // the value of sigma_spatial is not influenced by image width/height. 50 | // In this implementation, sigma_spatial is assumed over image width 255, height 255 51 | void setSigma(float sigma_spatial, float sigma_range); 52 | 53 | // Source and destination images are assumed to have 4 components per pixel 54 | // 'width' - maximum image width 55 | // 'height' - maximum image height 56 | // 'thread_count' - total thread count to use for each filter stage (horizontal and vertical), recommended thread count = 4; the per-stage task arrays hold RBF_MAX_THREADS entries 57 | // 'pipelined' - if true, then horizontal and vertical filter passes are split into separate stages, 58 | // where each stage uses 'thread_count' threads (so the total is basically doubled) 59 | // Returns true if successful; only very basic error checking is done 60 | bool initialize(int width, int height, int thread_count = 1, bool pipelined = false); 61 | 62 | // de-initialize, free memory 63 | void release(); 64 | 65 | // synchronous filter function, returns only when everything finished, goes faster if there's multiple threads 66 | // initialize() and setSigma() should be called before this 67 | // 'out_data' - output image buffer, assumes 4 byte per pixel 68 | // 'in_data' - input image buffer, assumes 4 byte per pixel 69 | // 'width' - width of both input and output buffers, must be same for both 70 | // 'height' - height of both input and output buffers, must be same for both 71 | // 'pitch' - row size in bytes, must be same for both buffers (ideally, this should 
be divisible by 16) 72 | // return false if failed for some reason (NOTE(review): the 'divisible by 16' hint for 'pitch' looks copied from the SSE2 header; ReadMe.md states the AVX2 path needs 32 byte aligned buffers and getOptimalPitch() exists for this purpose - confirm) 73 | bool filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch); 74 | 75 | // asynchronous, pipelined filter function 76 | // pipeline consists of 2 stages, one for horizontal filter, other for vertical filter 77 | // this is useful for video filtering where 1-2 frame delay is acceptable 78 | // for simplicity of this sample implementation, input and output data buffers must remain valid until filtering is finished 79 | // since it's 2 stage pipeline, consecutive calls should submit alternating buffers (2 sets of input and output buffers) 80 | // This function blocks until 1st stage finishes from previous call 81 | bool filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch); 82 | // this function blocks until both stages finished all processing 83 | // it should always be used to get last frame 84 | void filterPipeFlush(); 85 | }; -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # Optimized Recursive Bilateral Filter 2 | 3 | This project is a derivative work based on this project: 4 | https://github.com/ufoym/RecursiveBF 5 | 6 | The main purpose of this project is to provide a more optimized implementation of the edge preserving Recursive Bilateral Filter. For more information about the image filter, see the link above 7 | 8 | This project was made with VS2015 on Windows platform, but it should be easy to port if necessary. There aren't many files and I don't think any Windows specific functions were used. 9 | 10 | Optimization is based on 3 categories: reducing memory usage, adding multithreading, adding SSE2 / AVX2 C++ intrinsics 11 | 12 | * Memory usage: in original implementation, memory usage of RGB32 or RGBA image would be roughly = width * height * 40 + width * 40. 
In optimized implementation, it is roughly = width * height * 4 + width * 80 for non-pipelined version. And width * height * 12 + width * 80 for pipelined. In general, almost 10x less memory allocation 13 | 14 | * Multithreading: original implementation is written as single threaded solution, and in a way that is not easy to split into threads. Optimized solution is multithread friendly because it separates the filter into 2 stages, one for horizontal filter pass, other for vertical filter pass. Each filter pass can then be subdivided into user chosen number of threads. For horizontal filter, each thread handles its own row from original data buffer, while for vertical pass, each thread handles its own column block 15 | 16 | * SSE2 and AVX2: original implementation is written in basic C++ and while it is possible to select SSE2 or AVX2 optimization guidelines in compiler, the generated code does not properly take advantage of that functionality. Optimized solution provides 2 separate implementations, one written almost exclusively with SSE2 intrinsics, another almost exclusively with AVX2 intrinsics, so the compiler can utilize their capabilities much more effectively. 17 | 18 | It's important to mention that this optimized implementation has some fundamental differences with the original. Those are: 19 | 20 | * Only images with 4 bytes per pixel are accepted, this means RGB32 or RGBA. With some light modifications, it would be possible to adapt it to work with RGB24, single channel, or YUV 422 (2 pixels in 4 bytes) 21 | 22 | * Edge detection algorithm is different. In original version, 3 components (RGB) of 2 adjacent pixels are evaluated for absolute difference, then 2 of those absolute differences are divided by 4 and added together, 3rd component is divided by 2 and added to the sum. The goal is to get absolute difference between 2 pixels in 0-255 range, but this solution makes one of the components have 2x significance of other 2. 
Optimized solution offers 2 alternative options (chosen with a compiler flag): either get maximum of absolute differences between 3 components (stronger blur) or get 255 saturated sum of absolute differences of 3 components (weaker blur). Both methods have equal cost. The value of 4th component (alpha) is not taken into account, but it would be easy to do so if needed 23 | 24 | * Sigma Spatial: in original implementation, sigma spatial, one of the 2 blur parameters, has a dependency on image width and height. That means the same value would yield different amount of blur based on size of the image. Optimized solution removes that dependency by anchoring sigma spatial to arbitrary value of 255, making it uniform for both width and height. 25 | 26 | For testing purposes, 3 images were chosen: 27 | 28 | * testGirl.jpg - smallest image, 448 x 626, it is the same image used in original implementation 29 | * Thefarmhouse.jpg - larger image, 1440 x 1080, it is a painting with lots of small noise that can be blurred. 30 | * testpatern5.png - full HD image, 1920 x 1080, it is a test pattern that has no noise and sharp edges. 
It is useful for purpose of verifying that edge preserving image filter has minimum impact on the edges 31 | 32 | Here's what built 64 bit application output looks like on Intel i7-4700HQ (~2.4 GHz): 33 | 34 | ![alt text](./RBF_run64.png "64 bit application") 35 | 36 | Image paths and blur strength (sigma) values are hardcoded, at top of RecursiveBilateralFilter.cpp 37 | When application runs, it saves filtered image under generated name in same folder as original images 38 | 39 | Here is the same data in chart form, so it's easier to understand (time is in ms): 40 | 41 | ![alt text](./RBF_chart64.png "64 bit chart") 42 | 43 | It's interesting to note that the same application compiled as 32 bit performs significantly slower, especially for the original function 44 | 45 | ![alt text](./RBF_chart32.png "32 bit chart") 46 | 47 | Here's direct comparison of 64 bit vs 32 bit for full HD image 48 | 49 | ![alt text](./RBF_chart64vs32.png "64 vs 32 bit chart") 50 | 51 | Optimized solution provides 2 filter functions, one is designed for synchronous use - when multithreading is enabled, the function splits its work among threads and waits until they finish. Other filter function is asynchronous "push pipeline" mode, it divides task in 2 stages, horizontal filter pass and vertical filter pass. When horizontal pass is finished, it can start on next image while vertical pass starts on results of horizontal pass. 52 | Further optimizations with multithreading are possible, current implementation is provided as simple example. 53 | 54 | Most of the focus of this project is on utilization of XMM and YMM registers with SSE2 and AVX2 intrinsic functions. From the charts above, it is clear that even single threaded solution offers considerable speed up over original. It's also interesting to note that additional multithreading has diminishing returns, especially for small images. 
55 | 56 | SSE2 based filter solution was implemented to work with unaligned image buffers, while AVX2 requires input and output buffers to follow 32 byte alignment. It is possible to remove or relax that requirement with minor modifications, there is not a significant penalty of working with unaligned memory for read operations, but write operations would need a few extra instructions and generally make for messier code 57 | 58 | This project also provides a simple unoptimized C++ implementation of the Recursive Bilateral Filter in files RBFilterPlain.h, RBFilterPlain.cpp. This implementation does not participate in tests and it is only useful for the purposes of helping to understand the core of the algorithm. It's also useful for tinkering with filter design 59 | 60 | In conclusion, the most optimized implementation of Recursive Bilateral Filter is able to achieve roughly 10x speed up over original (slightly less) 61 | 62 | It is even possible to process full HD video at 60 fps, with some room to spare on CPU (though not much). For video processing, it would be best to add YUV 420 support, which is somewhat more involved due to its planar format. 
63 | 64 | -------------------------------------------------------------------------------- /RecursiveBilateralFilter.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {B003F67D-4A02-47C4-B0EC-E1A7BDC62663} 23 | Win32Proj 24 | RecursiveBilateralFilter 25 | 8.1 26 | 27 | 28 | 29 | Application 30 | true 31 | v140 32 | Unicode 33 | 34 | 35 | Application 36 | false 37 | v140 38 | true 39 | Unicode 40 | 41 | 42 | Application 43 | true 44 | v140 45 | Unicode 46 | 47 | 48 | Application 49 | false 50 | v140 51 | true 52 | Unicode 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | true 74 | 75 | 76 | true 77 | 78 | 79 | false 80 | 81 | 82 | false 83 | 84 | 85 | 86 | Use 87 | Level3 88 | Disabled 89 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | true 91 | AdvancedVectorExtensions2 92 | 93 | 94 | Console 95 | true 96 | 97 | 98 | 99 | 100 | Use 101 | Level3 102 | Disabled 103 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 104 | true 105 | AdvancedVectorExtensions2 106 | 107 | 108 | Console 109 | true 110 | 111 | 112 | 113 | 114 | Level3 115 | Use 116 | MaxSpeed 117 | true 118 | true 119 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 120 | true 121 | AdvancedVectorExtensions2 122 | Speed 123 | 124 | 125 | Console 126 | true 127 | true 128 | true 129 | 130 | 131 | 132 | 133 | Level3 134 | Use 135 | MaxSpeed 136 | true 137 | true 138 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 139 | true 140 | AdvancedVectorExtensions2 141 | Speed 142 | 143 | 144 | Console 145 | true 146 | true 147 | true 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | Create 170 | Create 171 | Create 172 | Create 173 | 174 | 175 | 176 | 177 | 178 | 
-------------------------------------------------------------------------------- /RBFilterPlain.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "RBFilterPlain.h" 3 | #include "stdafx.h" 4 | #include "RBFilterPlain.h" 5 | #include 6 | 7 | using namespace std; 8 | 9 | #define QX_DEF_CHAR_MAX 255 10 | 11 | 12 | CRBFilterPlain::CRBFilterPlain() 13 | { 14 | 15 | } 16 | 17 | CRBFilterPlain::~CRBFilterPlain() 18 | { 19 | releaseMemory(); 20 | } 21 | 22 | // allocates the left/right/down/up pass buffers (one color and one factor buffer each) for images up to max_width x max_height; intended for 3/4 channel images, 1 byte per channel 23 | void CRBFilterPlain::reserveMemory(int max_width, int max_height, int channels) 24 | { 25 | // basic sanity check: both sizes in [10, 10000), 1 to 4 channels 26 | _ASSERT(max_width >= 10 && max_width < 10000); 27 | _ASSERT(max_height >= 10 && max_height < 10000); 28 | _ASSERT(channels >= 1 && channels <= 4); 29 | 30 | releaseMemory(); 31 | 32 | m_reserve_width = max_width; 33 | m_reserve_height = max_height; 34 | m_reserve_channels = channels; 35 | 36 | int width_height = m_reserve_width * m_reserve_height; 37 | int width_height_channel = width_height * m_reserve_channels; 38 | 39 | m_left_pass_color = new float[width_height_channel]; 40 | m_left_pass_factor = new float[width_height]; 41 | 42 | m_right_pass_color = new float[width_height_channel]; 43 | m_right_pass_factor = new float[width_height]; 44 | 45 | m_down_pass_color = new float[width_height_channel]; 46 | m_down_pass_factor = new float[width_height]; 47 | 48 | m_up_pass_color = new float[width_height_channel]; 49 | m_up_pass_factor = new float[width_height]; 50 | } 51 | 52 | void CRBFilterPlain::releaseMemory() 53 | { 54 | m_reserve_width = 0; 55 | m_reserve_height = 0; 56 | m_reserve_channels = 0; 57 | 58 | if (m_left_pass_color) 59 | { 60 | delete[] m_left_pass_color; 61 | m_left_pass_color = nullptr; 62 | } 63 | 64 | if (m_left_pass_factor) 65 | { 66 | delete[] m_left_pass_factor; 67 | m_left_pass_factor = nullptr; 68 | } 69 | 70 | if (m_right_pass_color) 71 | { 72 | 
delete[] m_right_pass_color; 73 | m_right_pass_color = nullptr; 74 | } 75 | 76 | if (m_right_pass_factor) 77 | { 78 | delete[] m_right_pass_factor; 79 | m_right_pass_factor = nullptr; 80 | } 81 | 82 | if (m_down_pass_color) 83 | { 84 | delete[] m_down_pass_color; 85 | m_down_pass_color = nullptr; 86 | } 87 | 88 | if (m_down_pass_factor) 89 | { 90 | delete[] m_down_pass_factor; 91 | m_down_pass_factor = nullptr; 92 | } 93 | 94 | if (m_up_pass_color) 95 | { 96 | delete[] m_up_pass_color; 97 | m_up_pass_color = nullptr; 98 | } 99 | 100 | if (m_up_pass_factor) 101 | { 102 | delete[] m_up_pass_factor; 103 | m_up_pass_factor = nullptr; 104 | } 105 | } 106 | 107 | int CRBFilterPlain::getDiffFactor(const unsigned char* color1, const unsigned char* color2) const 108 | { 109 | int final_diff; 110 | int component_diff[4]; 111 | 112 | // find absolute difference between each component; the component count is m_reserve_channels (set by reserveMemory), not a value passed by the caller 113 | for (int i = 0; i < m_reserve_channels; i++) 114 | { 115 | component_diff[i] = abs(color1[i] - color2[i]); 116 | } 117 | 118 | // based on number of components, produce a single difference value in the 0-255 range (2: mean of both; 3: components 0 and 2 weighted 1/4 each, component 1 weighted 1/2; 4: mean of all four, so the 4th channel contributes; any other count yields 0) 119 | switch (m_reserve_channels) 120 | { 121 | case 1: 122 | final_diff = component_diff[0]; 123 | break; 124 | 125 | case 2: 126 | final_diff = ((component_diff[0] + component_diff[1]) >> 1); 127 | break; 128 | 129 | case 3: 130 | final_diff = ((component_diff[0] + component_diff[2]) >> 2) + (component_diff[1] >> 1); 131 | break; 132 | 133 | case 4: 134 | final_diff = ((component_diff[0] + component_diff[1] + component_diff[2] + component_diff[3]) >> 2); 135 | break; 136 | 137 | default: 138 | final_diff = 0; 139 | } 140 | 141 | _ASSERT(final_diff >= 0 && final_diff <= 255); 142 | 143 | return final_diff; 144 | } 145 | 146 | // memory must be reserved before calling image filter 147 | // this implementation of filter uses plain C++, single threaded 148 | // channel count must be 3 or 4 (alpha not used) 149 | void CRBFilterPlain::filter(unsigned char* img_src, unsigned char* 
img_dst, 150 | float sigma_spatial, float sigma_range, 151 | int width, int height, int channel) 152 | { 153 | _ASSERT(img_src); 154 | _ASSERT(img_dst); 155 | _ASSERT(m_reserve_channels == channel); 156 | _ASSERT(m_reserve_width >= width); 157 | _ASSERT(m_reserve_height >= height); 158 | 159 | // compute a lookup table 160 | float alpha_f = static_cast(exp(-sqrt(2.0) / (sigma_spatial * 255))); 161 | float inv_alpha_f = 1.f - alpha_f; 162 | 163 | 164 | float range_table_f[QX_DEF_CHAR_MAX + 1]; 165 | float inv_sigma_range = 1.0f / (sigma_range * QX_DEF_CHAR_MAX); 166 | { 167 | float ii = 0.f; 168 | for (int i = 0; i <= QX_DEF_CHAR_MAX; i++, ii -= 1.f) 169 | { 170 | range_table_f[i] = alpha_f * exp(ii * inv_sigma_range); 171 | } 172 | } 173 | 174 | /////////////// 175 | // Left pass 176 | { 177 | const unsigned char* src_color = img_src; 178 | float* left_pass_color = m_left_pass_color; 179 | float* left_pass_factor = m_left_pass_factor; 180 | 181 | for (int y = 0; y < height; y++) 182 | { 183 | const unsigned char* src_prev = src_color; 184 | const float* prev_factor = left_pass_factor; 185 | const float* prev_color = left_pass_color; 186 | 187 | // process 1st pixel separately since it has no previous 188 | *left_pass_factor++ = 1.f; 189 | for (int c = 0; c < channel; c++) 190 | { 191 | *left_pass_color++ = *src_color++; 192 | } 193 | 194 | // handle other pixels 195 | for (int x = 1; x < width; x++) 196 | { 197 | // determine difference in pixel color between current and previous 198 | // calculation is different depending on number of channels 199 | int diff = getDiffFactor(src_color, src_prev); 200 | src_prev = src_color; 201 | 202 | float alpha_f = range_table_f[diff]; 203 | 204 | *left_pass_factor++ = inv_alpha_f + alpha_f * (*prev_factor++); 205 | 206 | for (int c = 0; c < channel; c++) 207 | { 208 | *left_pass_color++ = inv_alpha_f * (*src_color++) + alpha_f * (*prev_color++); 209 | } 210 | } 211 | } 212 | } 213 | 214 | /////////////// 215 | // Right pass 216 
| { 217 | // start from end and then go up to begining 218 | int last_index = width * height * channel - 1; 219 | const unsigned char* src_color = img_src + last_index; 220 | float* right_pass_color = m_right_pass_color + last_index; 221 | float* right_pass_factor = m_right_pass_factor + width * height - 1; 222 | 223 | for (int y = 0; y < height; y++) 224 | { 225 | const unsigned char* src_prev = src_color; 226 | const float* prev_factor = right_pass_factor; 227 | const float* prev_color = right_pass_color; 228 | 229 | // process 1st pixel separately since it has no previous 230 | *right_pass_factor-- = 1.f; 231 | for (int c = 0; c < channel; c++) 232 | { 233 | *right_pass_color-- = *src_color--; 234 | } 235 | 236 | // handle other pixels 237 | for (int x = 1; x < width; x++) 238 | { 239 | // determine difference in pixel color between current and previous 240 | // calculation is different depending on number of channels 241 | int diff = getDiffFactor(src_color, src_color - 3); 242 | // src_prev = src_color; 243 | 244 | float alpha_f = range_table_f[diff]; 245 | 246 | *right_pass_factor-- = inv_alpha_f + alpha_f * (*prev_factor--); 247 | 248 | for (int c = 0; c < channel; c++) 249 | { 250 | *right_pass_color-- = inv_alpha_f * (*src_color--) + alpha_f * (*prev_color--); 251 | } 252 | } 253 | } 254 | } 255 | 256 | // vertical pass will be applied on top on horizontal pass, while using pixel differences from original image 257 | // result color stored in 'm_left_pass_color' and vertical pass will use it as source color 258 | { 259 | float* img_out = m_left_pass_color; // use as temporary buffer 260 | const float* left_pass_color = m_left_pass_color; 261 | const float* left_pass_factor = m_left_pass_factor; 262 | const float* right_pass_color = m_right_pass_color; 263 | const float* right_pass_factor = m_right_pass_factor; 264 | 265 | int width_height = width * height; 266 | for (int i = 0; i < width_height; i++) 267 | { 268 | // average color divided by average factor 
269 | float factor = 1.f / ((*left_pass_factor++) + (*right_pass_factor++)); 270 | for (int c = 0; c < channel; c++) 271 | { 272 | *img_out++ = (factor * ((*left_pass_color++) + (*right_pass_color++))); 273 | } 274 | } 275 | } 276 | 277 | /////////////// 278 | // Down pass 279 | { 280 | const float* src_color_hor = m_left_pass_color; // result of horizontal pass filter 281 | 282 | const unsigned char* src_color = img_src; 283 | float* down_pass_color = m_down_pass_color; 284 | float* down_pass_factor = m_down_pass_factor; 285 | 286 | const unsigned char* src_prev = src_color; 287 | const float* prev_color = down_pass_color; 288 | const float* prev_factor = down_pass_factor; 289 | 290 | // 1st line done separately because no previous line 291 | for (int x = 0; x < width; x++) 292 | { 293 | *down_pass_factor++ = 1.f; 294 | for (int c = 0; c < channel; c++) 295 | { 296 | *down_pass_color++ = *src_color_hor++; 297 | } 298 | src_color += channel; 299 | } 300 | 301 | // handle other lines 302 | for (int y = 1; y < height; y++) 303 | { 304 | for (int x = 0; x < width; x++) 305 | { 306 | // determine difference in pixel color between current and previous 307 | // calculation is different depending on number of channels 308 | int diff = getDiffFactor(src_color, src_prev); 309 | src_prev += channel; 310 | src_color += channel; 311 | 312 | float alpha_f = range_table_f[diff]; 313 | 314 | *down_pass_factor++ = inv_alpha_f + alpha_f * (*prev_factor++); 315 | 316 | for (int c = 0; c < channel; c++) 317 | { 318 | *down_pass_color++ = inv_alpha_f * (*src_color_hor++) + alpha_f * (*prev_color++); 319 | } 320 | } 321 | } 322 | } 323 | 324 | /////////////// 325 | // Up pass 326 | { 327 | // start from end and then go up to begining 328 | int last_index = width * height * channel - 1; 329 | const unsigned char* src_color = img_src + last_index; 330 | const float* src_color_hor = m_left_pass_color + last_index; // result of horizontal pass filter 331 | float* up_pass_color = 
m_up_pass_color + last_index; 332 | float* up_pass_factor = m_up_pass_factor + (width * height - 1); 333 | 334 | // const unsigned char* src_prev = src_color; 335 | const float* prev_color = up_pass_color; 336 | const float* prev_factor = up_pass_factor; 337 | 338 | // 1st line done separately because no previous line 339 | for (int x = 0; x < width; x++) 340 | { 341 | *up_pass_factor-- = 1.f; 342 | for (int c = 0; c < channel; c++) 343 | { 344 | *up_pass_color-- = *src_color_hor--; 345 | } 346 | src_color -= channel; 347 | } 348 | 349 | // handle other lines 350 | for (int y = 1; y < height; y++) 351 | { 352 | for (int x = 0; x < width; x++) 353 | { 354 | // determine difference in pixel color between current and previous 355 | // calculation is different depending on number of channels 356 | src_color -= channel; 357 | int diff = getDiffFactor(src_color, src_color + width * channel); 358 | 359 | float alpha_f = range_table_f[diff]; 360 | 361 | *up_pass_factor-- = inv_alpha_f + alpha_f * (*prev_factor--); 362 | 363 | for (int c = 0; c < channel; c++) 364 | { 365 | *up_pass_color-- = inv_alpha_f * (*src_color_hor--) + alpha_f * (*prev_color--); 366 | } 367 | } 368 | } 369 | } 370 | 371 | /////////////// 372 | // average result of vertical pass is written to output buffer 373 | { 374 | const float* down_pass_color = m_down_pass_color; 375 | const float* down_pass_factor = m_down_pass_factor; 376 | const float* up_pass_color = m_up_pass_color; 377 | const float* up_pass_factor = m_up_pass_factor; 378 | 379 | int width_height = width * height; 380 | for (int i = 0; i < width_height; i++) 381 | { 382 | // average color divided by average factor 383 | float factor = 1.f / ((*up_pass_factor++) + (*down_pass_factor++)); 384 | for (int c = 0; c < channel; c++) 385 | { 386 | *img_dst++ = (unsigned char)(factor * ((*up_pass_color++) + (*down_pass_color++))); 387 | } 388 | } 389 | } 390 | } 391 | -------------------------------------------------------------------------------- 
/rbf.hpp:
--------------------------------------------------------------------------------
#ifndef INCLUDE_RBF
#define INCLUDE_RBF
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#define QX_DEF_CHAR_MAX 255

/* ======================================================================

RecursiveBF: A lightweight library for recursive bilateral filtering.

-------------------------------------------------------------------------

Intro:      Recursive bilateral filtering (developed by Qingxiong Yang)
            is pretty fast compared with most edge-preserving filtering
            methods.

            -   computational complexity is linear in both input size and
                dimensionality
            -   takes about 43 ms to process a one mega-pixel color image
                (i7 1.8GHz & 4GB memory)
            -   about 18x faster than Fast high-dimensional filtering
                using the permutohedral lattice
            -   about 86x faster than Gaussian kd-trees for fast high-
                dimensional filtering


Usage:      // ----------------------------------------------------------
            // Basic Usage
            // ----------------------------------------------------------

            unsigned char * img = ...;                    // input image
            unsigned char * img_out = 0;                  // output image
            int width = ..., height = ..., channel = ...; // image size
            recursive_bf(img, img_out,
                         sigma_spatial, sigma_range,
                         width, height, channel);

            // ----------------------------------------------------------
            // Advanced: using external buffer for better performance
            // ----------------------------------------------------------

            unsigned char * img = ...;                    // input image
            unsigned char * img_out = 0;                  // output image
            int width = ..., height = ..., channel = ...; // image size
            float * buffer = new float[                   // external buf
                                 ( width * height* channel
                                 + width * height
                                 + width * channel
                                 + width) * 2];
            recursive_bf(img, img_out,
                         sigma_spatial, sigma_range,
                         width,
height, channel, 53 | buffer); 54 | delete[] buffer; 55 | 56 | 57 | Notice: Large sigma_spatial/sigma_range parameter may results in 58 | visible artifact which can be removed by an additional 59 | filter with small sigma_spatial/sigma_range parameter. 60 | 61 | ------------------------------------------------------------------------- 62 | 63 | Reference: Qingxiong Yang, Recursive Bilateral Filtering, 64 | European Conference on Computer Vision (ECCV) 2012, 399-413. 65 | 66 | ====================================================================== */ 67 | 68 | inline void recursive_bf( 69 | unsigned char * img_in, 70 | unsigned char *& img_out, 71 | float sigma_spatial, float sigma_range, 72 | int width, int height, int channel, 73 | float * buffer /*= 0*/); 74 | 75 | // ---------------------------------------------------------------------- 76 | 77 | inline void _recursive_bf( 78 | unsigned char * img, 79 | float sigma_spatial, float sigma_range, 80 | int width, int height, int channel, 81 | float * buffer = 0) 82 | { 83 | const int width_height = width * height; 84 | const int width_channel = width * channel; 85 | const int width_height_channel = width * height * channel; 86 | 87 | bool is_buffer_internal = (buffer == 0); 88 | if (is_buffer_internal) 89 | buffer = new float[(width_height_channel + width_height 90 | + width_channel + width) * 2]; 91 | 92 | float * img_out_f = buffer; 93 | float * img_temp = &img_out_f[width_height_channel]; 94 | float * map_factor_a = &img_temp[width_height_channel]; 95 | float * map_factor_b = &map_factor_a[width_height]; 96 | float * slice_factor_a = &map_factor_b[width_height]; 97 | float * slice_factor_b = &slice_factor_a[width_channel]; 98 | float * line_factor_a = &slice_factor_b[width_channel]; 99 | float * line_factor_b = &line_factor_a[width]; 100 | 101 | //compute a lookup table 102 | float range_table[QX_DEF_CHAR_MAX + 1]; 103 | float inv_sigma_range = 1.0f / (sigma_range * QX_DEF_CHAR_MAX); 104 | for (int i = 0; i <= 
QX_DEF_CHAR_MAX; i++) 105 | range_table[i] = static_cast(exp(-i * inv_sigma_range)); 106 | 107 | float alpha = static_cast(exp(-sqrt(2.0) / (sigma_spatial * width))); 108 | float ypr, ypg, ypb, ycr, ycg, ycb; 109 | float fp, fc; 110 | float inv_alpha_ = 1 - alpha; 111 | for (int y = 0; y < height; y++) 112 | { 113 | float * temp_x = &img_temp[y * width_channel]; 114 | unsigned char * in_x = &img[y * width_channel]; 115 | unsigned char * texture_x = &img[y * width_channel]; 116 | *temp_x++ = ypr = *in_x++; 117 | *temp_x++ = ypg = *in_x++; 118 | *temp_x++ = ypb = *in_x++; 119 | unsigned char tpr = *texture_x++; 120 | unsigned char tpg = *texture_x++; 121 | unsigned char tpb = *texture_x++; 122 | 123 | float * temp_factor_x = &map_factor_a[y * width]; 124 | *temp_factor_x++ = fp = 1; 125 | 126 | // from left to right 127 | for (int x = 1; x < width; x++) 128 | { 129 | unsigned char tcr = *texture_x++; 130 | unsigned char tcg = *texture_x++; 131 | unsigned char tcb = *texture_x++; 132 | unsigned char dr = abs(tcr - tpr); 133 | unsigned char dg = abs(tcg - tpg); 134 | unsigned char db = abs(tcb - tpb); 135 | int range_dist = (((dr << 1) + dg + db) >> 2); 136 | float weight = range_table[range_dist]; 137 | float alpha_ = weight*alpha; 138 | *temp_x++ = ycr = inv_alpha_*(*in_x++) + alpha_*ypr; 139 | *temp_x++ = ycg = inv_alpha_*(*in_x++) + alpha_*ypg; 140 | *temp_x++ = ycb = inv_alpha_*(*in_x++) + alpha_*ypb; 141 | tpr = tcr; tpg = tcg; tpb = tcb; 142 | ypr = ycr; ypg = ycg; ypb = ycb; 143 | *temp_factor_x++ = fc = inv_alpha_ + alpha_*fp; 144 | fp = fc; 145 | } 146 | *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x)); 147 | *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x)); 148 | *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x)); 149 | tpr = *--texture_x; 150 | tpg = *--texture_x; 151 | tpb = *--texture_x; 152 | ypr = *in_x; ypg = *in_x; ypb = *in_x; 153 | 154 | *--temp_factor_x; *temp_factor_x = 0.5f*((*temp_factor_x) + 1); 155 | fp = 1; 156 | 157 | // from right to left 
158 | for (int x = width - 2; x >= 0; x--) 159 | { 160 | unsigned char tcr = *--texture_x; 161 | unsigned char tcg = *--texture_x; 162 | unsigned char tcb = *--texture_x; 163 | unsigned char dr = abs(tcr - tpr); 164 | unsigned char dg = abs(tcg - tpg); 165 | unsigned char db = abs(tcb - tpb); 166 | int range_dist = (((dr << 1) + dg + db) >> 2); 167 | float weight = range_table[range_dist]; 168 | float alpha_ = weight * alpha; 169 | 170 | ycr = inv_alpha_ * (*--in_x) + alpha_ * ypr; 171 | ycg = inv_alpha_ * (*--in_x) + alpha_ * ypg; 172 | ycb = inv_alpha_ * (*--in_x) + alpha_ * ypb; 173 | *--temp_x; *temp_x = 0.5f*((*temp_x) + ycr); 174 | *--temp_x; *temp_x = 0.5f*((*temp_x) + ycg); 175 | *--temp_x; *temp_x = 0.5f*((*temp_x) + ycb); 176 | tpr = tcr; tpg = tcg; tpb = tcb; 177 | ypr = ycr; ypg = ycg; ypb = ycb; 178 | 179 | fc = inv_alpha_ + alpha_*fp; 180 | *--temp_factor_x; 181 | *temp_factor_x = 0.5f*((*temp_factor_x) + fc); 182 | fp = fc; 183 | } 184 | } 185 | alpha = static_cast(exp(-sqrt(2.0) / (sigma_spatial * height))); 186 | inv_alpha_ = 1 - alpha; 187 | float * ycy, * ypy, * xcy; 188 | unsigned char * tcy, * tpy; 189 | memcpy(img_out_f, img_temp, sizeof(float)* width_channel); 190 | 191 | float * in_factor = map_factor_a; 192 | float*ycf, *ypf, *xcf; 193 | memcpy(map_factor_b, in_factor, sizeof(float) * width); 194 | for (int y = 1; y < height; y++) 195 | { 196 | tpy = &img[(y - 1) * width_channel]; 197 | tcy = &img[y * width_channel]; 198 | xcy = &img_temp[y * width_channel]; 199 | ypy = &img_out_f[(y - 1) * width_channel]; 200 | ycy = &img_out_f[y * width_channel]; 201 | 202 | xcf = &in_factor[y * width]; 203 | ypf = &map_factor_b[(y - 1) * width]; 204 | ycf = &map_factor_b[y * width]; 205 | for (int x = 0; x < width; x++) 206 | { 207 | unsigned char dr = abs((*tcy++) - (*tpy++)); 208 | unsigned char dg = abs((*tcy++) - (*tpy++)); 209 | unsigned char db = abs((*tcy++) - (*tpy++)); 210 | int range_dist = (((dr << 1) + dg + db) >> 2); 211 | float weight = 
range_table[range_dist]; 212 | float alpha_ = weight*alpha; 213 | for (int c = 0; c < channel; c++) 214 | *ycy++ = inv_alpha_*(*xcy++) + alpha_*(*ypy++); 215 | *ycf++ = inv_alpha_*(*xcf++) + alpha_*(*ypf++); 216 | } 217 | } 218 | int h1 = height - 1; 219 | ycf = line_factor_a; 220 | ypf = line_factor_b; 221 | memcpy(ypf, &in_factor[h1 * width], sizeof(float) * width); 222 | for (int x = 0; x < width; x++) 223 | map_factor_b[h1 * width + x] = 0.5f*(map_factor_b[h1 * width + x] + ypf[x]); 224 | 225 | ycy = slice_factor_a; 226 | ypy = slice_factor_b; 227 | memcpy(ypy, &img_temp[h1 * width_channel], sizeof(float)* width_channel); 228 | int k = 0; 229 | for (int x = 0; x < width; x++) { 230 | for (int c = 0; c < channel; c++) { 231 | int idx = (h1 * width + x) * channel + c; 232 | img_out_f[idx] = 0.5f*(img_out_f[idx] + ypy[k++]) / map_factor_b[h1 * width + x]; 233 | } 234 | } 235 | 236 | for (int y = h1 - 1; y >= 0; y--) 237 | { 238 | tpy = &img[(y + 1) * width_channel]; 239 | tcy = &img[y * width_channel]; 240 | xcy = &img_temp[y * width_channel]; 241 | float*ycy_ = ycy; 242 | float*ypy_ = ypy; 243 | float*out_ = &img_out_f[y * width_channel]; 244 | 245 | xcf = &in_factor[y * width]; 246 | float*ycf_ = ycf; 247 | float*ypf_ = ypf; 248 | float*factor_ = &map_factor_b[y * width]; 249 | for (int x = 0; x < width; x++) 250 | { 251 | unsigned char dr = abs((*tcy++) - (*tpy++)); 252 | unsigned char dg = abs((*tcy++) - (*tpy++)); 253 | unsigned char db = abs((*tcy++) - (*tpy++)); 254 | int range_dist = (((dr << 1) + dg + db) >> 2); 255 | float weight = range_table[range_dist]; 256 | float alpha_ = weight*alpha; 257 | 258 | float fcc = inv_alpha_*(*xcf++) + alpha_*(*ypf_++); 259 | *ycf_++ = fcc; 260 | *factor_ = 0.5f * (*factor_ + fcc); 261 | 262 | for (int c = 0; c < channel; c++) 263 | { 264 | float ycc = inv_alpha_*(*xcy++) + alpha_*(*ypy_++); 265 | *ycy_++ = ycc; 266 | *out_ = 0.5f * (*out_ + ycc) / (*factor_); 267 | *out_++; 268 | } 269 | *factor_++; 270 | } 271 | 
memcpy(ypy, ycy, sizeof(float) * width_channel); 272 | memcpy(ypf, ycf, sizeof(float) * width); 273 | } 274 | 275 | for (int i = 0; i < width_height_channel; ++i) 276 | img[i] = static_cast(img_out_f[i]); 277 | 278 | if (is_buffer_internal) 279 | delete[] buffer; 280 | } 281 | 282 | 283 | inline void recursive_bf( 284 | unsigned char * img_in, 285 | unsigned char *& img_out, 286 | float sigma_spatial, float sigma_range, 287 | int width, int height, int channel, 288 | float * buffer = 0) 289 | { 290 | if (img_out == 0) 291 | img_out = new unsigned char[width * height * channel]; 292 | for (int i = 0; i < width * height * channel; ++i) 293 | img_out[i] = img_in[i]; 294 | _recursive_bf(img_out, sigma_spatial, sigma_range, width, height, channel, buffer); 295 | } 296 | 297 | #endif // INCLUDE_RBF 298 | -------------------------------------------------------------------------------- /RecursiveBilateralFilter.cpp: -------------------------------------------------------------------------------- 1 | // Purpose of this file is to run a series of tests on several images using different implementations of the 2 | // Recursive Bilaterial Filter, and to show rough time estimate for each run 3 | 4 | #include "stdafx.h" 5 | #include "stb_image.h" 6 | #include "stb_image_write.h" 7 | #include "rbf.hpp" 8 | #include 9 | #include 10 | #include 11 | #include "RBFilter_SSE2.h" 12 | #include "RBFilter_AVX2.h" 13 | #include 14 | 15 | using namespace std; 16 | 17 | // main filter strength controls 18 | const float sigma_spatial = 0.12f; 19 | const float sigma_range = 0.09f; 20 | 21 | // number of test runs per image, for better average time measurement 22 | // if running debug mode, use small number so it's faster 23 | #ifdef _DEBUG 24 | const int test_runs = 1; 25 | #else 26 | const int test_runs = 100; 27 | #endif 28 | 29 | // path where files are located, you may need to change this 30 | const char images_folder_path[] = "./images/"; 31 | 32 | // test images: 33 | const char 
file_name_testGirl[] = "testGirl.jpg"; // size: 448 x 626 34 | const char file_name_house[] = "Thefarmhouse.jpg"; // size: 1440 x 1080 35 | const char file_name_testpattern[] = "testpatern5.png"; // size: 1920 x 1080 36 | 37 | 38 | // timer uses 'test_runs' as divisor 39 | class TestRunTimer 40 | { 41 | clock_t begTime; 42 | 43 | public: 44 | void start() { begTime = clock(); } 45 | float elapsedTimeMS() { return float(clock() - begTime) / (float)test_runs; } 46 | }; 47 | 48 | // utility for setting output file name 49 | template 50 | char* modifyFilePath(char (&file_path)[_Size], const char* suffix) 51 | { 52 | size_t l = strlen(file_path); 53 | // get rid of old extension 54 | for (size_t i = l - 1; i > 0; i--) 55 | { 56 | if (file_path[i] == '.') 57 | { 58 | file_path[i] = 0; 59 | break; 60 | } 61 | } 62 | 63 | // add current sigma values just for clarity 64 | char extra_text[64]; 65 | sprintf_s(extra_text, "%0.3f_%0.3f", sigma_spatial, sigma_range); 66 | 67 | // add suffix 68 | strcat_s(file_path, "_"); 69 | strcat_s(file_path, suffix); 70 | strcat_s(file_path, "_"); 71 | strcat_s(file_path, extra_text); 72 | strcat_s(file_path, ".png"); // force PNG format 73 | 74 | return file_path; 75 | } 76 | 77 | // using original implementation, source code from 78 | // https://github.com/ufoym/RecursiveBF 79 | void testRunRecursiveBF_Original(const char* image_name) 80 | { 81 | cout << "\nImage: " << image_name; 82 | char file_path[256]; 83 | strcpy_s(file_path, images_folder_path); 84 | strcat_s(file_path, image_name); 85 | 86 | int width, height, channel; 87 | unsigned char * img = stbi_load(file_path, &width, &height, &channel, 3); 88 | if (!img) 89 | { 90 | cout << "\nFailed to load image path: " << file_path; 91 | return; 92 | } 93 | cout << ", size: " << width << " x " << height; 94 | channel = 3; // require 3 channel for this test 95 | unsigned char * img_out = nullptr; 96 | TestRunTimer timer; 97 | 98 | // memory reserve for filter algorithm before timer start 99 
| float * buffer = new float[(width * height* channel + width * height + width * channel + width) * 2]; 100 | 101 | timer.start(); 102 | for (int i = 0; i < test_runs; ++i) 103 | recursive_bf(img, img_out, sigma_spatial, sigma_range, width, height, channel, buffer); 104 | 105 | cout << ", time ms: " << timer.elapsedTimeMS(); 106 | 107 | delete[] buffer; 108 | 109 | modifyFilePath(file_path, "RBF"); 110 | stbi_write_png(file_path, width, height, channel, img_out, width * 3); 111 | 112 | delete[] img; 113 | delete[] img_out; 114 | } 115 | 116 | 117 | // using optimized SSE2 with optional multithreading, single stage (non-pipelined) 118 | void testRunRecursiveBF_SSE2_mt(const char* image_name, int thread_count) 119 | { 120 | cout << "\nImage: " << image_name; 121 | char file_path[256]; 122 | strcpy_s(file_path, images_folder_path); 123 | strcat_s(file_path, image_name); 124 | 125 | int width, height, channel; 126 | unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4); 127 | if (!img) 128 | { 129 | cout << "\nFailed to load image path: " << file_path; 130 | return; 131 | } 132 | cout << ", size: " << width << " x " << height; 133 | channel = 4; // require 4 channel for this test 134 | 135 | CRBFilterSSE2 rbf_object; 136 | bool success = rbf_object.initialize(width, height, thread_count, false); 137 | if (!success) 138 | { 139 | cout << "\nCRBFilterSSE2 failed to initialize for some reason"; 140 | delete[] img; 141 | return; 142 | } 143 | rbf_object.setSigma(sigma_spatial, sigma_range); 144 | 145 | unsigned char * img_out = new unsigned char[width * height * 4]; 146 | 147 | 148 | TestRunTimer timer; 149 | timer.start(); 150 | 151 | for (int i = 0; i < test_runs; ++i) 152 | success = rbf_object.filter(img_out, img, width, height, width * 4); 153 | 154 | if (success) 155 | { 156 | cout << ", time ms: " << timer.elapsedTimeMS(); 157 | } 158 | else // fail 159 | { 160 | cout << "\nCRBFilterSSE2::filter failed for some reason"; 161 | } 162 | 163 | char 
suffix[64]; 164 | sprintf_s(suffix, "SSE2_%dt", thread_count); 165 | modifyFilePath(file_path, suffix); 166 | stbi_write_png(file_path, width, height, channel, img_out, width * 4); 167 | 168 | delete[] img; 169 | delete[] img_out; 170 | } 171 | 172 | // using optimized SSE2 with optional multithreading, pipelined 2 stages 173 | void testRunRecursiveBF_SSE2_Pipelined(const char* image_name, int thread_count) 174 | { 175 | cout << "\nImage: " << image_name; 176 | char file_path[256]; 177 | strcpy_s(file_path, images_folder_path); 178 | strcat_s(file_path, image_name); 179 | 180 | int width, height, channel; 181 | unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4); 182 | if (!img) 183 | { 184 | cout << "\nFailed to load image path: " << file_path; 185 | return; 186 | } 187 | cout << ", size: " << width << " x " << height; 188 | channel = 4; // require 4 channel for this test 189 | 190 | CRBFilterSSE2 rbf_object; 191 | bool success = rbf_object.initialize(width, height, thread_count, true); 192 | if (!success) 193 | { 194 | cout << "\nCRBFilterSSE2 failed to initialize for some reason"; 195 | delete[] img; 196 | return; 197 | } 198 | rbf_object.setSigma(sigma_spatial, sigma_range); 199 | 200 | // need 2 output buffers, one for each stage 201 | unsigned char * img_out[2]; 202 | img_out[0] = new unsigned char[width * height * 4]; 203 | img_out[1] = new unsigned char[width * height * 4]; 204 | 205 | TestRunTimer timer; 206 | timer.start(); 207 | 208 | for (int i = 0; i < test_runs; ++i) 209 | success = rbf_object.filterPipePush(img_out[i&1], img, width, height, width * 4); 210 | 211 | rbf_object.filterPipeFlush(); 212 | 213 | if (success) 214 | { 215 | cout << ", time ms: " << timer.elapsedTimeMS(); 216 | } 217 | else // fail 218 | { 219 | cout << "\nCRBFilterSSE2::filterPipePush failed for some reason"; 220 | } 221 | 222 | char suffix[64]; 223 | sprintf_s(suffix, "SSE2_Pipe_%dt", thread_count); 224 | modifyFilePath(file_path, suffix); 225 | 
stbi_write_png(file_path, width, height, channel, img_out[0], width * 4); 226 | 227 | delete[] img; 228 | delete[] img_out[0]; 229 | delete[] img_out[1]; 230 | } 231 | 232 | 233 | // using optimized AVX2 with optional multithreading, single stage (non-pipelined) 234 | void testRunRecursiveBF_AVX2_mt(const char* image_name, int thread_count) 235 | { 236 | cout << "\nImage: " << image_name; 237 | char file_path[256]; 238 | strcpy_s(file_path, images_folder_path); 239 | strcat_s(file_path, image_name); 240 | 241 | int width, height, channel; 242 | unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4); 243 | if (!img) 244 | { 245 | cout << "\nFailed to load image path: " << file_path; 246 | return; 247 | } 248 | cout << ", size: " << width << " x " << height; 249 | channel = 4; // require 4 channel for this test 250 | 251 | CRBFilterAVX2 rbf_object; 252 | bool success = rbf_object.initialize(width, height, thread_count, false); 253 | if (!success) 254 | { 255 | cout << "\nCRBFilterAVX2 failed to initialize for some reason"; 256 | delete[] img; 257 | return; 258 | } 259 | rbf_object.setSigma(sigma_spatial, sigma_range); 260 | 261 | int pitch = rbf_object.getOptimalPitch(width); 262 | unsigned char * img_out; 263 | 264 | // setup 32 byte aligned memory buffers for input and output, using optimal pitch 265 | { 266 | img_out = (unsigned char*)_aligned_malloc(pitch * height, 32); 267 | 268 | // move source image to aligned memory 269 | unsigned char* buffer = (unsigned char*)_aligned_malloc(pitch * height, 32); 270 | for (int y = 0; y < height; y++) 271 | { 272 | memcpy(buffer + y * pitch, img + y * width * 4, width * 4); 273 | } 274 | delete[] img; 275 | img = buffer; 276 | } 277 | 278 | TestRunTimer timer; 279 | timer.start(); 280 | 281 | for (int i = 0; i < test_runs; ++i) 282 | success = rbf_object.filter(img_out, img, width, height, pitch); 283 | 284 | if (success) 285 | { 286 | cout << ", time ms: " << timer.elapsedTimeMS(); 287 | } 288 | else // 
fail 289 | { 290 | cout << "\nCRBFilterAVX2::filter failed for some reason"; 291 | } 292 | 293 | char suffix[64]; 294 | sprintf_s(suffix, "AVX2_%dt", thread_count); 295 | modifyFilePath(file_path, suffix); 296 | stbi_write_png(file_path, width, height, channel, img_out, pitch); 297 | 298 | _aligned_free(img); 299 | _aligned_free(img_out); 300 | } 301 | 302 | // using optimized AVX2 with optional multithreading, pipelined 2 stages, memory aligned 303 | void testRunRecursiveBF_AVX2_Pipelined(const char* image_name, int thread_count) 304 | { 305 | cout << "\nImage: " << image_name; 306 | char file_path[256]; 307 | strcpy_s(file_path, images_folder_path); 308 | strcat_s(file_path, image_name); 309 | 310 | int width, height, channel; 311 | unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4); 312 | if (!img) 313 | { 314 | cout << "\nFailed to load image path: " << file_path; 315 | return; 316 | } 317 | cout << ", size: " << width << " x " << height; 318 | channel = 4; // require 4 channel for this test 319 | 320 | CRBFilterAVX2 rbf_object; 321 | bool success = rbf_object.initialize(width, height, thread_count, true); 322 | if (!success) 323 | { 324 | cout << "\nCRBFilterAVX2 failed to initialize for some reason"; 325 | delete[] img; 326 | return; 327 | } 328 | rbf_object.setSigma(sigma_spatial, sigma_range); 329 | 330 | int pitch = rbf_object.getOptimalPitch(width); 331 | unsigned char* img_out[2]; 332 | 333 | // setup 32 byte aligned memory buffers for input and output, using optimal pitch 334 | { 335 | img_out[0] = (unsigned char*)_aligned_malloc(pitch * height, 32); 336 | img_out[1] = (unsigned char*)_aligned_malloc(pitch * height, 32); 337 | 338 | // move source image to aligned memory 339 | unsigned char* buffer = (unsigned char*)_aligned_malloc(pitch * height, 32); 340 | for (int y = 0; y < height; y++) 341 | { 342 | memcpy(buffer + y * pitch, img + y * width * 4, width * 4); 343 | } 344 | delete[] img; 345 | img = buffer; 346 | } 347 | 348 | 
TestRunTimer timer; 349 | timer.start(); 350 | 351 | for (int i = 0; i < test_runs; ++i) 352 | success = rbf_object.filterPipePush(img_out[i & 1], img, width, height, width * 4); 353 | 354 | rbf_object.filterPipeFlush(); 355 | 356 | if (success) 357 | { 358 | cout << ", time ms: " << timer.elapsedTimeMS(); 359 | } 360 | else // fail 361 | { 362 | cout << "\nCRBFilterAVX2::filterPipePush failed for some reason"; 363 | } 364 | 365 | char suffix[64]; 366 | sprintf_s(suffix, "AVX2_Pipe_%dt", thread_count); 367 | modifyFilePath(file_path, suffix); 368 | stbi_write_png(file_path, width, height, channel, img_out[0], pitch); 369 | 370 | _aligned_free(img); 371 | _aligned_free(img_out[0]); 372 | _aligned_free(img_out[1]); 373 | } 374 | 375 | ///////////////////////////////////////////////////////////////////////////// 376 | 377 | int main() 378 | { 379 | cout << "test run \n"; 380 | cout << fixed << setprecision(1); 381 | 382 | //////////////////////// 383 | cout << "\nOriginal Recursive Bilateral Filter implementation"; 384 | // image: testpattern 385 | testRunRecursiveBF_Original(file_name_testpattern); 386 | // image: house 387 | testRunRecursiveBF_Original(file_name_house); 388 | // image: testGirl 389 | testRunRecursiveBF_Original(file_name_testGirl); 390 | 391 | 392 | //////////////////////// 393 | cout << "\n\nOptimized SSE2 single threaded, single stage (non-pipelined)"; 394 | // image: testpattern 395 | testRunRecursiveBF_SSE2_mt(file_name_testpattern, 1); 396 | // image: house 397 | testRunRecursiveBF_SSE2_mt(file_name_house, 1); 398 | // image: testGirl 399 | testRunRecursiveBF_SSE2_mt(file_name_testGirl, 1); 400 | 401 | //////////////////////// 402 | cout << "\n\nOptimized SSE2 2x multithreading, single stage (non-pipelined)"; 403 | // image: testpattern 404 | testRunRecursiveBF_SSE2_mt(file_name_testpattern, 2); 405 | // image: house 406 | testRunRecursiveBF_SSE2_mt(file_name_house, 2); 407 | // image: testGirl 408 | 
testRunRecursiveBF_SSE2_mt(file_name_testGirl, 2); 409 | 410 | //////////////////////// 411 | cout << "\n\nOptimized SSE2 4x multithreading, single stage (non-pipelined)"; 412 | // image: testpattern 413 | testRunRecursiveBF_SSE2_mt(file_name_testpattern, 4); 414 | // image: house 415 | testRunRecursiveBF_SSE2_mt(file_name_house, 4); 416 | // image: testGirl 417 | testRunRecursiveBF_SSE2_mt(file_name_testGirl, 4); 418 | 419 | //////////////////////// 420 | cout << "\n\nOptimized SSE2 4x2 thread pipelined 2 stages"; 421 | // image: testpattern 422 | testRunRecursiveBF_SSE2_Pipelined(file_name_testpattern, 4); 423 | // image: house 424 | testRunRecursiveBF_SSE2_Pipelined(file_name_house, 4); 425 | // image: testGirl 426 | testRunRecursiveBF_SSE2_Pipelined(file_name_testGirl, 4); 427 | 428 | //////////////////////// 429 | cout << "\n\nOptimized AVX2 single threaded, single stage (non-pipelined), memory aligned"; 430 | // image: testpattern 431 | testRunRecursiveBF_AVX2_mt(file_name_testpattern, 1); 432 | // image: house 433 | testRunRecursiveBF_AVX2_mt(file_name_house, 1); 434 | // image: testGirl 435 | testRunRecursiveBF_AVX2_mt(file_name_testGirl, 1); 436 | 437 | //////////////////////// 438 | cout << "\n\nOptimized AVX2 2x multithreading, single stage (non-pipelined), memory aligned"; 439 | // image: testpattern 440 | testRunRecursiveBF_AVX2_mt(file_name_testpattern, 2); 441 | // image: house 442 | testRunRecursiveBF_AVX2_mt(file_name_house, 2); 443 | // image: testGirl 444 | testRunRecursiveBF_AVX2_mt(file_name_testGirl, 2); 445 | 446 | //////////////////////// 447 | cout << "\n\nOptimized AVX2 4x multithreading, single stage (non-pipelined), memory aligned"; 448 | // image: testpattern 449 | testRunRecursiveBF_AVX2_mt(file_name_testpattern, 4); 450 | // image: house 451 | testRunRecursiveBF_AVX2_mt(file_name_house, 4); 452 | // image: testGirl 453 | testRunRecursiveBF_AVX2_mt(file_name_testGirl, 4); 454 | 455 | //////////////////////// 456 | cout << 
"\n\nOptimized AVX2 4x2 thread pipelined 2 stages, memory aligned"; 457 | // image: testpattern 458 | testRunRecursiveBF_AVX2_Pipelined(file_name_testpattern, 4); 459 | // image: house 460 | testRunRecursiveBF_AVX2_Pipelined(file_name_house, 4); 461 | // image: testGirl 462 | testRunRecursiveBF_AVX2_Pipelined(file_name_testGirl, 4); 463 | 464 | cout << "\nFinish"; 465 | cin.get(); 466 | 467 | return 0; 468 | } 469 | 470 | -------------------------------------------------------------------------------- /RBFilter_SSE2.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "RBFilter_SSE2.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | #define MAX_RANGE_TABLE_SIZE 255 12 | #define ALIGN_SIZE 16 13 | 14 | // only 1 of following 2 should be defined 15 | #define EDGE_COLOR_USE_MAXIMUM 16 | //#define EDGE_COLOR_USE_ADDITION 17 | 18 | // if EDGE_COLOR_USE_MAXIMUM is defined, then edge color detection works by calculating 19 | // maximum difference among 3 components (RGB) of 2 colors, which tends to result in lower differences (since only largest among 3 is selected) 20 | // if EDGE_COLOR_USE_ADDITION is defined, then edge color detection works by calculating 21 | // sum of all 3 components, while enforcing 255 maximum. 
This method is much more sensitive to small differences 22 | 23 | #if defined(EDGE_COLOR_USE_MAXIMUM) && defined(EDGE_COLOR_USE_ADDITION) 24 | #error Only 1 of those can be defined 25 | #endif 26 | 27 | #if !defined(EDGE_COLOR_USE_MAXIMUM) && !defined(EDGE_COLOR_USE_ADDITION) 28 | #error 1 of those must be defined 29 | #endif 30 | 31 | CRBFilterSSE2::CRBFilterSSE2() 32 | { 33 | m_range_table = new float[MAX_RANGE_TABLE_SIZE + 1]; 34 | memset(m_range_table, 0, (MAX_RANGE_TABLE_SIZE + 1) * sizeof(float)); 35 | } 36 | 37 | CRBFilterSSE2::~CRBFilterSSE2() 38 | { 39 | release(); 40 | 41 | delete[] m_range_table; 42 | } 43 | 44 | bool CRBFilterSSE2::initialize(int width, int height, int thread_count, bool pipelined) 45 | { 46 | // basic sanity check, not strict 47 | if (width < 16 || width > 10000) 48 | return false; 49 | 50 | if (height < 2 || height > 10000) 51 | return false; 52 | 53 | if (thread_count < 1 || thread_count > RBF_MAX_THREADS) 54 | return false; 55 | 56 | release(); 57 | 58 | // round width up to nearest ALIGN_SIZE * thread_count 59 | int round_up = (ALIGN_SIZE / 4) * thread_count; 60 | if (width % round_up) 61 | { 62 | width += round_up - width % round_up; 63 | } 64 | m_reserved_width = width; 65 | m_reserved_height = height; 66 | m_thread_count = thread_count; 67 | 68 | m_stage_buffer[0] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); 69 | if (!m_stage_buffer[0]) 70 | return false; 71 | 72 | if (pipelined) 73 | { 74 | for (int i = 1; i < STAGE_BUFFER_COUNT; i++) 75 | { 76 | m_stage_buffer[i] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); 77 | if (!m_stage_buffer[i]) 78 | return false; 79 | } 80 | } 81 | 82 | m_h_line_cache = new (std::nothrow) float*[m_thread_count]; 83 | if (!m_h_line_cache) 84 | return false; 85 | 86 | // zero just in case 87 | for (int i = 0; i < m_thread_count; i++) 88 | m_h_line_cache[i] = nullptr; 89 | 90 | for (int i = 0; i < m_thread_count; i++) 
91 | { 92 | m_h_line_cache[i] = (float*)_aligned_malloc(m_reserved_width * 12 * sizeof(float) , ALIGN_SIZE); 93 | if (!m_h_line_cache[i]) 94 | return false; 95 | } 96 | 97 | // if (m_pipelined) 98 | { 99 | m_v_line_cache = new (std::nothrow) float*[m_thread_count]; 100 | if (!m_v_line_cache) 101 | return false; 102 | 103 | for (int i = 0; i < m_thread_count; i++) 104 | m_v_line_cache[i] = nullptr; 105 | 106 | for (int i = 0; i < m_thread_count; i++) 107 | { 108 | m_v_line_cache[i] = (float*)_aligned_malloc((m_reserved_width * 8 * sizeof(float)) / m_thread_count, ALIGN_SIZE); 109 | if (!m_v_line_cache[i]) 110 | return false; 111 | } 112 | } 113 | 114 | 115 | return true; 116 | } 117 | 118 | void CRBFilterSSE2::release() 119 | { 120 | for (int i = 0; i < STAGE_BUFFER_COUNT; i++) 121 | { 122 | if (m_stage_buffer[i]) 123 | { 124 | _aligned_free(m_stage_buffer[i]); 125 | m_stage_buffer[i] = nullptr; 126 | } 127 | } 128 | 129 | if (m_h_line_cache) 130 | { 131 | for (int i = 0; i < m_thread_count; i++) 132 | { 133 | if (m_h_line_cache[i]) 134 | _aligned_free(m_h_line_cache[i]); 135 | } 136 | delete[] m_h_line_cache; 137 | m_h_line_cache = nullptr; 138 | } 139 | 140 | // if (m_pipelined) 141 | { 142 | for (int i = 0; i < m_thread_count; i++) 143 | { 144 | if (m_v_line_cache[i]) 145 | _aligned_free(m_v_line_cache[i]); 146 | } 147 | delete[] m_v_line_cache; 148 | } 149 | m_v_line_cache = nullptr; 150 | 151 | m_reserved_width = 0; 152 | m_reserved_height = 0; 153 | m_thread_count = 0; 154 | m_pipelined = false; 155 | m_filter_counter = 0; 156 | } 157 | 158 | void CRBFilterSSE2::setSigma(float sigma_spatial, float sigma_range) 159 | { 160 | if (m_sigma_spatial != sigma_spatial || m_sigma_range != sigma_range) 161 | { 162 | m_sigma_spatial = sigma_spatial; 163 | m_sigma_range = sigma_range; 164 | 165 | double alpha_f = (exp(-sqrt(2.0) / (sigma_spatial * 255.0))); 166 | m_inv_alpha_f = (float)(1.0 - alpha_f); 167 | double inv_sigma_range = 1.0 / (sigma_range * 
MAX_RANGE_TABLE_SIZE); 168 | { 169 | double ii = 0.f; 170 | for (int i = 0; i <= MAX_RANGE_TABLE_SIZE; i++, ii -= 1.0) 171 | { 172 | m_range_table[i] = (float)(alpha_f * exp(ii * inv_sigma_range)); 173 | } 174 | } 175 | } 176 | } 177 | 178 | // example of edge color difference calculation from original implementation 179 | // idea is to fit maximum edge color difference as single number in 0-255 range 180 | // colors are added then 2 components are scaled 4x while 1 complement is scaled 2x 181 | // this means 1 of the components is more dominant 182 | 183 | //int getDiffFactor(const unsigned char* color1, const unsigned char* color2) 184 | //{ 185 | // int c1 = abs(color1[0] - color2[0]); 186 | // int c2 = abs(color1[1] - color2[1]); 187 | // int c3 = abs(color1[2] - color2[2]); 188 | // 189 | // return ((c1 + c3) >> 2) + (c2 >> 1); 190 | //} 191 | 192 | 193 | inline void getDiffFactor3x(__m128i pix4, __m128i pix4p, __m128i* diff4x) 194 | { 195 | static __m128i byte_mask = _mm_set1_epi32(255); 196 | 197 | // get absolute difference for each component per pixel 198 | __m128i diff = _mm_sub_epi8(_mm_max_epu8(pix4, pix4p), _mm_min_epu8(pix4, pix4p)); 199 | 200 | #ifdef EDGE_COLOR_USE_MAXIMUM 201 | // get maximum of 3 components 202 | __m128i diff_shift1 = _mm_srli_epi32(diff, 8); // 2nd component 203 | diff = _mm_max_epu8(diff, diff_shift1); 204 | diff_shift1 = _mm_srli_epi32(diff_shift1, 8); // 3rd component 205 | diff = _mm_max_epu8(diff, diff_shift1); 206 | // skip alpha component 207 | diff = _mm_and_si128(diff, byte_mask); // zero out all but 1st byte 208 | #endif 209 | 210 | #ifdef EDGE_COLOR_USE_ADDITION 211 | // add all component differences and saturate 212 | __m128i diff_shift1 = _mm_srli_epi32(diff, 8); // 2nd component 213 | diff = _mm_adds_epu8(diff, diff_shift1); 214 | diff_shift1 = _mm_srli_epi32(diff_shift1, 8); // 3rd component 215 | diff = _mm_adds_epu8(diff, diff_shift1); 216 | diff = _mm_and_si128(diff, byte_mask); // zero out all but 1st byte 217 
| #endif 218 | 219 | _mm_store_si128(diff4x, diff); 220 | } 221 | 222 | 223 | void CRBFilterSSE2::horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) 224 | { 225 | int height_segment = height / m_thread_count; 226 | int buffer_offset = thread_index * height_segment * pitch; 227 | img_src += buffer_offset; 228 | img_dst += buffer_offset; 229 | 230 | if (thread_index + 1 == m_thread_count) // last segment should account for uneven height 231 | height_segment += height % m_thread_count; 232 | 233 | float* line_cache = m_h_line_cache[thread_index]; 234 | const float* range_table = m_range_table; 235 | 236 | __m128 inv_alpha = _mm_set_ps1(m_inv_alpha_f); 237 | __m128 half_value = _mm_set_ps1(0.5f); 238 | __m128i mask_pack = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 239 | __m128i mask_unpack = _mm_setr_epi8(12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1); 240 | 241 | // used to store maximum difference between 2 pixels 242 | __declspec(align(16)) long color_diff[4]; 243 | 244 | for (int y = 0; y < height_segment; y++) 245 | { 246 | ////////////////////// 247 | // right to left pass, results of this pass get stored in 'line_cache' 248 | { 249 | int pixels_left = width - 1; 250 | 251 | // get end of line buffer 252 | float* line_buffer = line_cache + pixels_left * 12; 253 | 254 | /////// 255 | // handle last pixel in row separately as special case 256 | { 257 | const unsigned char* last_src = img_src + (y + 1) * pitch - 4; 258 | 259 | // result color 260 | line_buffer[8] = (float)last_src[0]; 261 | line_buffer[9] = (float)last_src[1]; 262 | line_buffer[10] = (float)last_src[2]; 263 | line_buffer[11] = (float)last_src[3]; 264 | 265 | // premultiplied source 266 | // caching pre-multiplied allows saving 1 multiply operation in 2nd pass loop, not a big difference 267 | line_buffer[4] = m_inv_alpha_f * line_buffer[8]; 268 | line_buffer[5] = m_inv_alpha_f * 
line_buffer[9]; 269 | line_buffer[6] = m_inv_alpha_f * line_buffer[10]; 270 | line_buffer[7] = m_inv_alpha_f * line_buffer[11]; 271 | } 272 | 273 | // "previous" pixel color 274 | __m128 pixel_prev = _mm_load_ps(line_buffer + 8); 275 | // "previous" pixel factor 276 | __m128 alpha_f_prev4 = _mm_set_ps1(1.f); 277 | 278 | /////// 279 | // handle most middle pixels in 16 byte intervals using xmm registers 280 | // process 4x pixels at a time 281 | int buffer_inc = y * pitch + (pixels_left - 1) * 4 - 16; 282 | const __m128i* src_4xCur = (const __m128i*)(img_src + buffer_inc); 283 | const __m128i* src_4xPrev = (const __m128i*)(img_src + buffer_inc + 4); 284 | 285 | while (pixels_left > 0) // outer loop 4x pixel 286 | { 287 | // load 4x pixel, may read backward past start of buffer, but it's OK since that extra data won't be used 288 | __m128i pix4 = _mm_loadu_si128(src_4xCur--); 289 | __m128i pix4p = _mm_loadu_si128(src_4xPrev--); 290 | 291 | // get color differences 292 | getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff); 293 | 294 | for (int i = 3; i >= 0 && pixels_left-- > 0; i--) // inner loop 295 | { 296 | float alpha_f = range_table[color_diff[i]]; 297 | __m128 alpha_f_4x = _mm_set_ps1(alpha_f); 298 | 299 | // cache weights for next filter pass 300 | line_buffer -= 12; 301 | _mm_store_ps(line_buffer, alpha_f_4x); 302 | 303 | // color factor 304 | alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x); 305 | alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha); 306 | 307 | // unpack current source pixel 308 | __m128i pix1 = _mm_shuffle_epi8(pix4, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 309 | pix4 = _mm_slli_si128(pix4, 4); // shift left so next loop unpacks next pixel data 310 | __m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats 311 | 312 | 313 | // apply color filter 314 | pixel_F = _mm_mul_ps(pixel_F, inv_alpha); 315 | _mm_store_ps(line_buffer + 4, pixel_F); // cache pre-multiplied source color for next filter pass 316 | 
alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x); 317 | pixel_F = _mm_add_ps(pixel_F, alpha_f_4x); 318 | 319 | // store current color as previous for next cycle 320 | pixel_prev = pixel_F; 321 | 322 | // calculate final color 323 | pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4); 324 | 325 | // cache filtered color for next filter pass 326 | _mm_store_ps(line_buffer + 8, pixel_F); 327 | } 328 | } 329 | } 330 | 331 | ////////////////////// 332 | // left to right pass 333 | { 334 | int pixels_left = width - 1; 335 | 336 | // process 4x pixels at a time 337 | int buffer_inc = y * pitch; 338 | const __m128i* src_4xCur = (const __m128i*)(img_src + buffer_inc + 4); // shifted by 1 pixel 339 | const __m128i* src_4xPrev = (const __m128i*)(img_src + buffer_inc); 340 | 341 | // use float type only to enable 4 byte write using MOVSS 342 | float* out_result = (float*)(img_dst + buffer_inc + 4); // start at 2nd pixel from left 343 | 344 | const float* line_buffer = line_cache; 345 | 346 | /////// 347 | // handle first pixel in row separately as special case 348 | { 349 | unsigned char* first_dst = img_dst + buffer_inc; 350 | // average new pixel with one already in output 351 | // source color was pre-multipled, so get original 352 | float inv_factor = 1.f / m_inv_alpha_f; 353 | first_dst[0] = (unsigned char)((line_buffer[4] * inv_factor + line_buffer[8]) * 0.5f); 354 | first_dst[1] = (unsigned char)((line_buffer[5] * inv_factor + line_buffer[9]) * 0.5f); 355 | first_dst[2] = (unsigned char)((line_buffer[6] * inv_factor + line_buffer[10]) * 0.5f); 356 | first_dst[3] = (unsigned char)((line_buffer[7] * inv_factor + line_buffer[11]) * 0.5f); 357 | } 358 | 359 | // initialize "previous pixel" with 4 components of last row pixel 360 | __m128 pixel_prev = _mm_load_ps(line_buffer + 8); 361 | line_buffer += 12; 362 | __m128 alpha_f_prev4 = _mm_set_ps1(1.f); 363 | 364 | 365 | /////// 366 | // handle most pixels in 16 byte intervals using xmm registers 367 | while (pixels_left > 0) // outer 
loop 4x pixel 368 | { 369 | for (int i = 0; i <= 3 && pixels_left-- > 0; i++) // inner loop 370 | { 371 | // load cached factor 372 | __m128 alpha_f_4x = _mm_load_ps(line_buffer); 373 | line_buffer += 12; 374 | 375 | // color factor 376 | alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x); 377 | alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha); 378 | 379 | // load current source pixel, pre-multiplied 380 | __m128 pixel_F = _mm_load_ps(line_buffer + 4); 381 | 382 | 383 | // apply color filter 384 | alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x); 385 | pixel_F = _mm_add_ps(pixel_F, alpha_f_4x); 386 | 387 | // store current color as previous for next cycle 388 | pixel_prev = pixel_F; 389 | 390 | // calculate final color 391 | pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4); 392 | 393 | // average this result with result from previous pass 394 | __m128 prev_pix4 = _mm_load_ps(line_buffer + 8); 395 | 396 | pixel_F = _mm_add_ps(pixel_F, prev_pix4); 397 | pixel_F = _mm_mul_ps(pixel_F, half_value); 398 | 399 | // pack float pixel into byte pixel 400 | __m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer 401 | pixB = _mm_shuffle_epi8(pixB, mask_pack); 402 | _mm_store_ss(out_result++, _mm_castsi128_ps(pixB)); 403 | 404 | } 405 | } 406 | } 407 | } 408 | } 409 | 410 | 411 | void CRBFilterSSE2::verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) 412 | { 413 | int width_segment = width / m_thread_count; 414 | // make sure width segments round to 16 byte boundary except for last one 415 | width_segment -= width_segment % 4; 416 | int start_offset = width_segment * thread_index; 417 | if (thread_index == m_thread_count - 1) // last one 418 | width_segment = width - start_offset; 419 | 420 | int width4 = width_segment / 4; 421 | 422 | // adjust img buffer starting positions 423 | img_src += start_offset * 4; 424 | img_dst += start_offset * 4; 425 | 426 | float* line_cache = 
m_v_line_cache[thread_index]; 427 | const float* range_table = m_range_table; 428 | 429 | __m128 inv_alpha = _mm_set_ps1(m_inv_alpha_f); 430 | __m128 half_value = _mm_set_ps1(0.5f); 431 | __m128i mask_pack = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 432 | __m128i mask_unpack = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); 433 | 434 | // used to store maximum difference between 2 pixels 435 | __declspec(align(16)) long color_diff[4]; 436 | 437 | ///////////////// 438 | // Bottom to top pass first 439 | { 440 | // last line processed separately since no previous 441 | { 442 | unsigned char* dst_line = img_dst + (height - 1) * pitch; 443 | const unsigned char* src_line = img_src + (height - 1) * pitch; 444 | float* line_buffer = line_cache; 445 | 446 | memcpy(dst_line, src_line, width_segment * 4); // copy last line 447 | 448 | // initialize line cache 449 | for (int x = 0; x < width_segment; x++) 450 | { 451 | // set factor to 1 452 | line_buffer[0] = 1.f; 453 | line_buffer[1] = 1.f; 454 | line_buffer[2] = 1.f; 455 | line_buffer[3] = 1.f; 456 | 457 | // set result color 458 | line_buffer[4] = (float)src_line[0]; 459 | line_buffer[5] = (float)src_line[1]; 460 | line_buffer[6] = (float)src_line[2]; 461 | line_buffer[7] = (float)src_line[3]; 462 | 463 | src_line += 4; 464 | line_buffer += 8; 465 | } 466 | } 467 | 468 | // process other lines 469 | for (int y = height - 2; y >= 0; y--) 470 | { 471 | float* dst_line = (float*)(img_dst + y * pitch); 472 | float* line_buffer = line_cache; 473 | 474 | __m128i* src_4xCur = (__m128i*)(img_src + y * pitch); 475 | __m128i* src_4xPrev = (__m128i*)(img_src + (y + 1) * pitch); 476 | 477 | int pixels_left = width_segment; 478 | while (pixels_left > 0) 479 | { 480 | // may read past end of buffer, but that data won't be used 481 | __m128i pix4 = _mm_loadu_si128(src_4xCur++); // load 4x pixel 482 | __m128i pix4p = _mm_loadu_si128(src_4xPrev++); 483 | 484 | // get color 
differences 485 | getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff); 486 | 487 | for (int i = 0; i < 4 && pixels_left-- > 0; i++) // inner loop 488 | { 489 | float alpha_f = range_table[color_diff[i]]; 490 | __m128 alpha_f_4x = _mm_set_ps1(alpha_f); 491 | 492 | // load previous line color factor 493 | __m128 alpha_f_prev4 = _mm_load_ps(line_buffer); 494 | // load previous line color 495 | __m128 pixel_prev = _mm_load_ps(line_buffer + 4); 496 | 497 | // color factor 498 | alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x); 499 | alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha); 500 | 501 | // unpack current source pixel 502 | __m128i pix1 = _mm_shuffle_epi8(pix4, mask_unpack); 503 | pix4 = _mm_srli_si128(pix4, 4); // shift right 504 | __m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats 505 | 506 | 507 | // apply color filter 508 | pixel_F = _mm_mul_ps(pixel_F, inv_alpha); 509 | alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x); 510 | pixel_F = _mm_add_ps(pixel_F, alpha_f_4x); 511 | 512 | // store current factor and color as previous for next cycle 513 | _mm_store_ps(line_buffer, alpha_f_prev4); 514 | _mm_store_ps(line_buffer + 4, pixel_F); 515 | line_buffer += 8; 516 | 517 | // calculate final color 518 | pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4); 519 | 520 | // pack float pixel into byte pixel 521 | __m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer 522 | pixB = _mm_shuffle_epi8(pixB, mask_pack); 523 | _mm_store_ss(dst_line++, _mm_castsi128_ps(pixB)); 524 | } 525 | } 526 | } 527 | } 528 | 529 | ///////////////// 530 | // Top to bottom pass last 531 | { 532 | mask_pack = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); 533 | 534 | // first line handled separately since no previous 535 | { 536 | unsigned char* dst_line = img_dst; 537 | const unsigned char* src_line = img_src; 538 | float* line_buffer = line_cache; 539 | 540 | for (int x = 0; x < width_segment; x++) 541 | { 542 | // average ccurrent destanation 
color with current source 543 | dst_line[0] = (dst_line[0] + src_line[0]) / 2; 544 | dst_line[1] = (dst_line[1] + src_line[1]) / 2; 545 | dst_line[2] = (dst_line[2] + src_line[2]) / 2; 546 | dst_line[3] = (dst_line[3] + src_line[3]) / 2; 547 | 548 | // set factor to 1 549 | line_buffer[0] = 1.f; 550 | line_buffer[1] = 1.f; 551 | line_buffer[2] = 1.f; 552 | line_buffer[3] = 1.f; 553 | 554 | // set result color 555 | line_buffer[4] = (float)src_line[0]; 556 | line_buffer[5] = (float)src_line[1]; 557 | line_buffer[6] = (float)src_line[2]; 558 | line_buffer[7] = (float)src_line[3]; 559 | 560 | dst_line += 4; 561 | src_line += 4; 562 | line_buffer += 8; 563 | } 564 | } 565 | 566 | // process other lines 567 | for (int y = 1; y < height; y++) 568 | { 569 | // const unsigned char* src_line = img_src + y * pitch; 570 | float* line_buffer = line_cache; 571 | 572 | __m128i* src_4xCur = (__m128i*)(img_src + y * pitch); 573 | __m128i* src_4xPrev = (__m128i*)(img_src + (y - 1) * pitch); 574 | __m128i* dst_4x = (__m128i*)(img_dst + y * pitch); 575 | 576 | for (int x = 0; x < width4; x++) 577 | { 578 | // get color difference 579 | __m128i pix4 = _mm_loadu_si128(src_4xCur++); // load 4x pixel 580 | __m128i pix4p = _mm_loadu_si128(src_4xPrev++); 581 | 582 | // get color differences 583 | getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff); 584 | 585 | __m128i out_pix4; 586 | for (int i = 0; i < 4; i++) // inner loop 587 | { 588 | float alpha_f = range_table[color_diff[i]]; 589 | __m128 alpha_f_4x = _mm_set_ps1(alpha_f); 590 | 591 | // load previous line color factor 592 | __m128 alpha_f_prev4 = _mm_load_ps(line_buffer); 593 | // load previous line color 594 | __m128 pixel_prev = _mm_load_ps(line_buffer + 4); 595 | 596 | // color factor 597 | // alpha_f_prev = m_inv_alpha_f + alpha_f * alpha_f_prev; 598 | alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x); 599 | alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha); 600 | 601 | // unpack current source pixel 602 | __m128i pix1 = 
_mm_shuffle_epi8(pix4, mask_unpack); 603 | pix4 = _mm_srli_si128(pix4, 4); // shift right 604 | __m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats 605 | 606 | // apply color filter 607 | pixel_F = _mm_mul_ps(pixel_F, inv_alpha); 608 | alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x); 609 | pixel_F = _mm_add_ps(pixel_F, alpha_f_4x); 610 | 611 | // store current factor and color as previous for next cycle 612 | _mm_store_ps(line_buffer, alpha_f_prev4); 613 | _mm_store_ps(line_buffer + 4, pixel_F); 614 | line_buffer += 8; 615 | 616 | // calculate final color 617 | pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4); 618 | 619 | // pack float pixel into byte pixel 620 | __m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer 621 | pixB = _mm_shuffle_epi8(pixB, mask_pack); 622 | 623 | out_pix4 = _mm_srli_si128(out_pix4, 4); // shift 624 | out_pix4 = _mm_or_si128(out_pix4, pixB); 625 | 626 | } 627 | 628 | // average result 4x pixel with what is already in destination 629 | __m128i dst4 = _mm_loadu_si128(dst_4x); 630 | out_pix4 = _mm_avg_epu8(out_pix4, dst4); 631 | _mm_storeu_si128(dst_4x++, out_pix4); // store 4x pixel 632 | } 633 | 634 | // have to handle leftover 1-3 pixels if last width segment isn't divisble by 4 635 | if (width_segment % 4) 636 | { 637 | // this should be avoided by having image buffers with pitch divisible by 16 638 | } 639 | } 640 | } 641 | 642 | } 643 | 644 | bool CRBFilterSSE2::filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) 645 | { 646 | // basic error checking 647 | if (!m_stage_buffer[0]) 648 | return false; 649 | 650 | if (width < 16 || width > m_reserved_width) 651 | return false; 652 | 653 | if (height < 16 || height > m_reserved_height) 654 | return false; 655 | 656 | if (pitch < width * 4) 657 | return false; 658 | 659 | if (!out_data || !in_data) 660 | return false; 661 | 662 | if (m_inv_alpha_f == 0.f) 663 | return false; 664 | 665 | int thread_count_adjusted = m_thread_count - 
1; 666 | 667 | ////////////////////////////////////////////// 668 | // horizontal filter divided in threads 669 | for (int i = 0; i < thread_count_adjusted; i++) 670 | { 671 | m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::horizontalFilter, this, i, in_data, m_stage_buffer[0], width, height, pitch); 672 | } 673 | 674 | // use this thread for last segment 675 | horizontalFilter(thread_count_adjusted, in_data, m_stage_buffer[0], width, height, pitch); 676 | 677 | // wait for result 678 | for (int i = 0; i < thread_count_adjusted; i++) 679 | { 680 | m_horizontal_tasks[i].get(); 681 | } 682 | 683 | ///////////////////////////////////////////// 684 | // vertical filter divided in threads 685 | for (int i = 0; i < thread_count_adjusted; i++) 686 | { 687 | m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::verticalFilter, this, i, m_stage_buffer[0], out_data, width, height, pitch); 688 | } 689 | 690 | // use this thread for last segment 691 | verticalFilter(thread_count_adjusted, m_stage_buffer[0], out_data, width, height, pitch); 692 | 693 | // wait for result 694 | for (int i = 0; i < thread_count_adjusted; i++) 695 | { 696 | m_vertical_tasks[i].get(); 697 | } 698 | 699 | return true; 700 | } 701 | 702 | bool CRBFilterSSE2::filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) 703 | { 704 | // basic error checking 705 | if (!m_stage_buffer[0]) 706 | return false; 707 | 708 | if (width < 16 || width > m_reserved_width) 709 | return false; 710 | 711 | if (height < 16 || height > m_reserved_height) 712 | return false; 713 | 714 | if (pitch < width * 4) 715 | return false; 716 | 717 | if (m_inv_alpha_f == 0.f) 718 | return false; 719 | 720 | m_image_width = width; 721 | m_image_height = height; 722 | m_image_pitch = pitch; 723 | 724 | // block until last frame finished 1st stage 725 | for (int i = 0; i < m_thread_count; i++) 726 | { 727 | if (m_horizontal_tasks[i].valid()) 728 | 
m_horizontal_tasks[i].get(); 729 | } 730 | 731 | int previous_stage_index = (m_filter_counter - 1) % STAGE_BUFFER_COUNT; 732 | int current_stage_index = m_filter_counter % STAGE_BUFFER_COUNT; 733 | m_filter_counter++; 734 | m_out_buffer[current_stage_index] = out_data; 735 | 736 | // start new horizontal stage 737 | if (in_data) 738 | { 739 | // start first stage for current frame 740 | for (int i = 0; i < m_thread_count; i++) 741 | { 742 | m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::horizontalFilter, this, i, in_data, m_stage_buffer[current_stage_index], width, height, pitch); 743 | } 744 | } 745 | 746 | // block until last frame finished 2nd stage 747 | for (int i = 0; i < m_thread_count; i++) 748 | { 749 | if (m_vertical_tasks[i].valid()) 750 | m_vertical_tasks[i].get(); 751 | } 752 | 753 | // start new vertical stage based on result of previous stage 754 | if (previous_stage_index >= 0 && m_out_buffer[previous_stage_index]) 755 | { 756 | // start first stage for current frame 757 | for (int i = 0; i < m_thread_count; i++) 758 | { 759 | m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::verticalFilter, this, i, m_stage_buffer[previous_stage_index], m_out_buffer[previous_stage_index], width, height, pitch); 760 | } 761 | } 762 | 763 | return true; 764 | } 765 | 766 | void CRBFilterSSE2::filterPipeFlush() 767 | { 768 | filterPipePush(nullptr, nullptr, m_image_width, m_image_height, m_image_pitch); 769 | 770 | if (m_filter_counter > 0) 771 | { 772 | for (int i = 0; i < m_thread_count; i++) 773 | { 774 | if(m_vertical_tasks[i].valid()) 775 | m_vertical_tasks[i].get(); 776 | } 777 | } 778 | 779 | m_filter_counter = 0; 780 | } -------------------------------------------------------------------------------- /stb_image_write.h: -------------------------------------------------------------------------------- 1 | /* stb_image_write - v1.05 - public domain - http://nothings.org/stb/stb_image_write.h 2 | writes out PNG/BMP/TGA 
images to C stdio - Sean Barrett 2010-2015 3 | no warranty implied; use at your own risk 4 | 5 | Before #including, 6 | 7 | #define STB_IMAGE_WRITE_IMPLEMENTATION 8 | 9 | in the file that you want to have the implementation. 10 | 11 | Will probably not work correctly with strict-aliasing optimizations. 12 | 13 | ABOUT: 14 | 15 | This header file is a library for writing images to C stdio. It could be 16 | adapted to write to memory or a general streaming interface; let me know. 17 | 18 | The PNG output is not optimal; it is 20-50% larger than the file 19 | written by a decent optimizing implementation. This library is designed 20 | for source code compactness and simplicity, not optimal image file size 21 | or run-time performance. 22 | 23 | BUILDING: 24 | 25 | You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h. 26 | You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace 27 | malloc,realloc,free. 28 | You can define STBIW_MEMMOVE() to replace memmove() 29 | 30 | USAGE: 31 | 32 | There are four functions, one for each image file format: 33 | 34 | int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); 35 | int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); 36 | int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); 37 | int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); 38 | 39 | There are also four equivalent functions that use an arbitrary write function. 
You are 40 | expected to open/close your file-equivalent before and after calling these: 41 | 42 | int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); 43 | int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 44 | int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 45 | int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); 46 | 47 | where the callback is: 48 | void stbi_write_func(void *context, void *data, int size); 49 | 50 | You can define STBI_WRITE_NO_STDIO to disable the file variant of these 51 | functions, so the library will not use stdio.h at all. However, this will 52 | also disable HDR writing, because it requires stdio for formatted output. 53 | 54 | Each function returns 0 on failure and non-0 on success. 55 | 56 | The functions create an image file defined by the parameters. The image 57 | is a rectangle of pixels stored from left-to-right, top-to-bottom. 58 | Each pixel contains 'comp' channels of data stored interleaved with 8-bits 59 | per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is 60 | monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall. 61 | The *data pointer points to the first byte of the top-left-most pixel. 62 | For PNG, "stride_in_bytes" is the distance in bytes from the first byte of 63 | a row of pixels to the first byte of the next row of pixels. 64 | 65 | PNG creates output files with the same number of components as the input. 66 | The BMP format expands Y to RGB in the file format and does not 67 | output alpha. 68 | 69 | PNG supports writing rectangles of data even when the bytes storing rows of 70 | data are not consecutive in memory (e.g. sub-rectangles of a larger image), 71 | by supplying the stride between the beginning of adjacent rows. 
The other 72 | formats do not. (Thus you cannot write a native-format BMP through the BMP 73 | writer, both because it is in BGR order and because it may have padding 74 | at the end of the line.) 75 | 76 | HDR expects linear float data. Since the format is always 32-bit rgb(e) 77 | data, alpha (if provided) is discarded, and for monochrome data it is 78 | replicated across all three channels. 79 | 80 | TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed 81 | data, set the global variable 'stbi_write_tga_with_rle' to 0. 82 | 83 | CREDITS: 84 | 85 | PNG/BMP/TGA 86 | Sean Barrett 87 | HDR 88 | Baldur Karlsson 89 | TGA monochrome: 90 | Jean-Sebastien Guay 91 | misc enhancements: 92 | Tim Kelsey 93 | TGA RLE 94 | Alan Hickman 95 | initial file IO callback implementation 96 | Emmanuel Julien 97 | bugfixes: 98 | github:Chribba 99 | Guillaume Chereau 100 | github:jry2 101 | github:romigrou 102 | Sergio Gonzalez 103 | Jonas Karlsson 104 | Filip Wasil 105 | Thatcher Ulrich 106 | github:poppolopoppo 107 | Patrick Boettcher 108 | 109 | LICENSE 110 | 111 | See end of file for license information. 
112 | 113 | */ 114 | 115 | #ifndef INCLUDE_STB_IMAGE_WRITE_H 116 | #define INCLUDE_STB_IMAGE_WRITE_H 117 | 118 | #ifdef __cplusplus 119 | extern "C" { 120 | #endif 121 | 122 | #ifdef STB_IMAGE_WRITE_STATIC 123 | #define STBIWDEF static 124 | #else 125 | #define STBIWDEF extern 126 | extern int stbi_write_tga_with_rle; 127 | #endif 128 | 129 | #ifndef STBI_WRITE_NO_STDIO 130 | STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); 131 | STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); 132 | STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); 133 | STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); 134 | #endif 135 | 136 | typedef void stbi_write_func(void *context, void *data, int size); 137 | 138 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); 139 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 140 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 141 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); 142 | 143 | #ifdef __cplusplus 144 | } 145 | #endif 146 | 147 | #endif//INCLUDE_STB_IMAGE_WRITE_H 148 | 149 | #ifdef STB_IMAGE_WRITE_IMPLEMENTATION 150 | 151 | #ifdef _WIN32 152 | #ifndef _CRT_SECURE_NO_WARNINGS 153 | #define _CRT_SECURE_NO_WARNINGS 154 | #endif 155 | #ifndef _CRT_NONSTDC_NO_DEPRECATE 156 | #define _CRT_NONSTDC_NO_DEPRECATE 157 | #endif 158 | #endif 159 | 160 | #ifndef STBI_WRITE_NO_STDIO 161 | #include 162 | #endif // STBI_WRITE_NO_STDIO 163 | 164 | #include 165 | #include 166 | #include 167 | #include 168 | 169 | #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && 
(defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED)) 170 | // ok 171 | #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED) 172 | // ok 173 | #else 174 | #error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)." 175 | #endif 176 | 177 | #ifndef STBIW_MALLOC 178 | #define STBIW_MALLOC(sz) malloc(sz) 179 | #define STBIW_REALLOC(p,newsz) realloc(p,newsz) 180 | #define STBIW_FREE(p) free(p) 181 | #endif 182 | 183 | #ifndef STBIW_REALLOC_SIZED 184 | #define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz) 185 | #endif 186 | 187 | 188 | #ifndef STBIW_MEMMOVE 189 | #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz) 190 | #endif 191 | 192 | 193 | #ifndef STBIW_ASSERT 194 | #include 195 | #define STBIW_ASSERT(x) assert(x) 196 | #endif 197 | 198 | #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff) 199 | 200 | typedef struct 201 | { 202 | stbi_write_func *func; 203 | void *context; 204 | } stbi__write_context; 205 | 206 | // initialize a callback-based context 207 | static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context) 208 | { 209 | s->func = c; 210 | s->context = context; 211 | } 212 | 213 | #ifndef STBI_WRITE_NO_STDIO 214 | 215 | static void stbi__stdio_write(void *context, void *data, int size) 216 | { 217 | fwrite(data,1,size,(FILE*) context); 218 | } 219 | 220 | static int stbi__start_write_file(stbi__write_context *s, const char *filename) 221 | { 222 | FILE *f = fopen(filename, "wb"); 223 | stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); 224 | return f != NULL; 225 | } 226 | 227 | static void stbi__end_write_file(stbi__write_context *s) 228 | { 229 | fclose((FILE *)s->context); 230 | } 231 | 232 | #endif // !STBI_WRITE_NO_STDIO 233 | 234 | typedef unsigned int stbiw_uint32; 235 | typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 
1 : -1]; 236 | 237 | #ifdef STB_IMAGE_WRITE_STATIC 238 | static int stbi_write_tga_with_rle = 1; 239 | #else 240 | int stbi_write_tga_with_rle = 1; 241 | #endif 242 | 243 | static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) 244 | { 245 | while (*fmt) { 246 | switch (*fmt++) { 247 | case ' ': break; 248 | case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); 249 | s->func(s->context,&x,1); 250 | break; } 251 | case '2': { int x = va_arg(v,int); 252 | unsigned char b[2]; 253 | b[0] = STBIW_UCHAR(x); 254 | b[1] = STBIW_UCHAR(x>>8); 255 | s->func(s->context,b,2); 256 | break; } 257 | case '4': { stbiw_uint32 x = va_arg(v,int); 258 | unsigned char b[4]; 259 | b[0]=STBIW_UCHAR(x); 260 | b[1]=STBIW_UCHAR(x>>8); 261 | b[2]=STBIW_UCHAR(x>>16); 262 | b[3]=STBIW_UCHAR(x>>24); 263 | s->func(s->context,b,4); 264 | break; } 265 | default: 266 | STBIW_ASSERT(0); 267 | return; 268 | } 269 | } 270 | } 271 | 272 | static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) 
273 | { 274 | va_list v; 275 | va_start(v, fmt); 276 | stbiw__writefv(s, fmt, v); 277 | va_end(v); 278 | } 279 | 280 | static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) 281 | { 282 | unsigned char arr[3]; 283 | arr[0] = a, arr[1] = b, arr[2] = c; 284 | s->func(s->context, arr, 3); 285 | } 286 | 287 | static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) 288 | { 289 | unsigned char bg[3] = { 255, 0, 255}, px[3]; 290 | int k; 291 | 292 | if (write_alpha < 0) 293 | s->func(s->context, &d[comp - 1], 1); 294 | 295 | switch (comp) { 296 | case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case 297 | case 1: 298 | if (expand_mono) 299 | stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp 300 | else 301 | s->func(s->context, d, 1); // monochrome TGA 302 | break; 303 | case 4: 304 | if (!write_alpha) { 305 | // composite against pink background 306 | for (k = 0; k < 3; ++k) 307 | px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; 308 | stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); 309 | break; 310 | } 311 | /* FALLTHROUGH */ 312 | case 3: 313 | stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); 314 | break; 315 | } 316 | if (write_alpha > 0) 317 | s->func(s->context, &d[comp - 1], 1); 318 | } 319 | 320 | static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) 321 | { 322 | stbiw_uint32 zero = 0; 323 | int i,j, j_end; 324 | 325 | if (y <= 0) 326 | return; 327 | 328 | if (vdir < 0) 329 | j_end = -1, j = y-1; 330 | else 331 | j_end = y, j = 0; 332 | 333 | for (; j != j_end; j += vdir) { 334 | for (i=0; i < x; ++i) { 335 | unsigned char *d = (unsigned char *) data + (j*x+i)*comp; 336 | stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); 337 | } 338 | s->func(s->context, &zero, 
scanline_pad); 339 | } 340 | } 341 | 342 | static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) 343 | { 344 | if (y < 0 || x < 0) { 345 | return 0; 346 | } else { 347 | va_list v; 348 | va_start(v, fmt); 349 | stbiw__writefv(s, fmt, v); 350 | va_end(v); 351 | stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono); 352 | return 1; 353 | } 354 | } 355 | 356 | static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) 357 | { 358 | int pad = (-x*3) & 3; 359 | return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, 360 | "11 4 22 4" "4 44 22 444444", 361 | 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header 362 | 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header 363 | } 364 | 365 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) 366 | { 367 | stbi__write_context s; 368 | stbi__start_write_callbacks(&s, func, context); 369 | return stbi_write_bmp_core(&s, x, y, comp, data); 370 | } 371 | 372 | #ifndef STBI_WRITE_NO_STDIO 373 | STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) 374 | { 375 | stbi__write_context s; 376 | if (stbi__start_write_file(&s,filename)) { 377 | int r = stbi_write_bmp_core(&s, x, y, comp, data); 378 | stbi__end_write_file(&s); 379 | return r; 380 | } else 381 | return 0; 382 | } 383 | #endif //!STBI_WRITE_NO_STDIO 384 | 385 | static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) 386 | { 387 | int has_alpha = (comp == 2 || comp == 4); 388 | int colorbytes = has_alpha ? comp-1 : comp; 389 | int format = colorbytes < 2 ? 
3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 390 | 391 | if (y < 0 || x < 0) 392 | return 0; 393 | 394 | if (!stbi_write_tga_with_rle) { 395 | return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0, 396 | "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8); 397 | } else { 398 | int i,j,k; 399 | 400 | stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); 401 | 402 | for (j = y - 1; j >= 0; --j) { 403 | unsigned char *row = (unsigned char *) data + j * x * comp; 404 | int len; 405 | 406 | for (i = 0; i < x; i += len) { 407 | unsigned char *begin = row + i * comp; 408 | int diff = 1; 409 | len = 1; 410 | 411 | if (i < x - 1) { 412 | ++len; 413 | diff = memcmp(begin, row + (i + 1) * comp, comp); 414 | if (diff) { 415 | const unsigned char *prev = begin; 416 | for (k = i + 2; k < x && len < 128; ++k) { 417 | if (memcmp(prev, row + k * comp, comp)) { 418 | prev += comp; 419 | ++len; 420 | } else { 421 | --len; 422 | break; 423 | } 424 | } 425 | } else { 426 | for (k = i + 2; k < x && len < 128; ++k) { 427 | if (!memcmp(begin, row + k * comp, comp)) { 428 | ++len; 429 | } else { 430 | break; 431 | } 432 | } 433 | } 434 | } 435 | 436 | if (diff) { 437 | unsigned char header = STBIW_UCHAR(len - 1); 438 | s->func(s->context, &header, 1); 439 | for (k = 0; k < len; ++k) { 440 | stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); 441 | } 442 | } else { 443 | unsigned char header = STBIW_UCHAR(len - 129); 444 | s->func(s->context, &header, 1); 445 | stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); 446 | } 447 | } 448 | } 449 | } 450 | return 1; 451 | } 452 | 453 | int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) 454 | { 455 | stbi__write_context s; 456 | stbi__start_write_callbacks(&s, func, context); 457 | return stbi_write_tga_core(&s, x, y, comp, (void 
*) data); 458 | } 459 | 460 | #ifndef STBI_WRITE_NO_STDIO 461 | int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) 462 | { 463 | stbi__write_context s; 464 | if (stbi__start_write_file(&s,filename)) { 465 | int r = stbi_write_tga_core(&s, x, y, comp, (void *) data); 466 | stbi__end_write_file(&s); 467 | return r; 468 | } else 469 | return 0; 470 | } 471 | #endif 472 | 473 | // ************************************************************************************************* 474 | // Radiance RGBE HDR writer 475 | // by Baldur Karlsson 476 | 477 | #define stbiw__max(a, b) ((a) > (b) ? (a) : (b)) 478 | 479 | void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) 480 | { 481 | int exponent; 482 | float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); 483 | 484 | if (maxcomp < 1e-32f) { 485 | rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; 486 | } else { 487 | float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp; 488 | 489 | rgbe[0] = (unsigned char)(linear[0] * normalize); 490 | rgbe[1] = (unsigned char)(linear[1] * normalize); 491 | rgbe[2] = (unsigned char)(linear[2] * normalize); 492 | rgbe[3] = (unsigned char)(exponent + 128); 493 | } 494 | } 495 | 496 | void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) 497 | { 498 | unsigned char lengthbyte = STBIW_UCHAR(length+128); 499 | STBIW_ASSERT(length+128 <= 255); 500 | s->func(s->context, &lengthbyte, 1); 501 | s->func(s->context, &databyte, 1); 502 | } 503 | 504 | void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) 505 | { 506 | unsigned char lengthbyte = STBIW_UCHAR(length); 507 | STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code 508 | s->func(s->context, &lengthbyte, 1); 509 | s->func(s->context, data, length); 510 | } 511 | 512 | void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float 
*scanline) 513 | { 514 | unsigned char scanlineheader[4] = { 2, 2, 0, 0 }; 515 | unsigned char rgbe[4]; 516 | float linear[3]; 517 | int x; 518 | 519 | scanlineheader[2] = (width&0xff00)>>8; 520 | scanlineheader[3] = (width&0x00ff); 521 | 522 | /* skip RLE for images too small or large */ 523 | if (width < 8 || width >= 32768) { 524 | for (x=0; x < width; x++) { 525 | switch (ncomp) { 526 | case 4: /* fallthrough */ 527 | case 3: linear[2] = scanline[x*ncomp + 2]; 528 | linear[1] = scanline[x*ncomp + 1]; 529 | linear[0] = scanline[x*ncomp + 0]; 530 | break; 531 | default: 532 | linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; 533 | break; 534 | } 535 | stbiw__linear_to_rgbe(rgbe, linear); 536 | s->func(s->context, rgbe, 4); 537 | } 538 | } else { 539 | int c,r; 540 | /* encode into scratch buffer */ 541 | for (x=0; x < width; x++) { 542 | switch(ncomp) { 543 | case 4: /* fallthrough */ 544 | case 3: linear[2] = scanline[x*ncomp + 2]; 545 | linear[1] = scanline[x*ncomp + 1]; 546 | linear[0] = scanline[x*ncomp + 0]; 547 | break; 548 | default: 549 | linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; 550 | break; 551 | } 552 | stbiw__linear_to_rgbe(rgbe, linear); 553 | scratch[x + width*0] = rgbe[0]; 554 | scratch[x + width*1] = rgbe[1]; 555 | scratch[x + width*2] = rgbe[2]; 556 | scratch[x + width*3] = rgbe[3]; 557 | } 558 | 559 | s->func(s->context, scanlineheader, 4); 560 | 561 | /* RLE each component separately */ 562 | for (c=0; c < 4; c++) { 563 | unsigned char *comp = &scratch[width*c]; 564 | 565 | x = 0; 566 | while (x < width) { 567 | // find first run 568 | r = x; 569 | while (r+2 < width) { 570 | if (comp[r] == comp[r+1] && comp[r] == comp[r+2]) 571 | break; 572 | ++r; 573 | } 574 | if (r+2 >= width) 575 | r = width; 576 | // dump up to first run 577 | while (x < r) { 578 | int len = r-x; 579 | if (len > 128) len = 128; 580 | stbiw__write_dump_data(s, len, &comp[x]); 581 | x += len; 582 | } 583 | // if there's a run, output it 584 | if 
(r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd 585 | // find next byte after run 586 | while (r < width && comp[r] == comp[x]) 587 | ++r; 588 | // output run up to r 589 | while (x < r) { 590 | int len = r-x; 591 | if (len > 127) len = 127; 592 | stbiw__write_run_data(s, len, comp[x]); 593 | x += len; 594 | } 595 | } 596 | } 597 | } 598 | } 599 | } 600 | 601 | static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) 602 | { 603 | if (y <= 0 || x <= 0 || data == NULL) 604 | return 0; 605 | else { 606 | // Each component is stored separately. Allocate scratch space for full output scanline. 607 | unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4); 608 | int i, len; 609 | char buffer[128]; 610 | char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; 611 | s->func(s->context, header, sizeof(header)-1); 612 | 613 | len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); 614 | s->func(s->context, buffer, len); 615 | 616 | for(i=0; i < y; i++) 617 | stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*i*x); 618 | STBIW_FREE(scratch); 619 | return 1; 620 | } 621 | } 622 | 623 | int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) 624 | { 625 | stbi__write_context s; 626 | stbi__start_write_callbacks(&s, func, context); 627 | return stbi_write_hdr_core(&s, x, y, comp, (float *) data); 628 | } 629 | 630 | #ifndef STBI_WRITE_NO_STDIO 631 | int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) 632 | { 633 | stbi__write_context s; 634 | if (stbi__start_write_file(&s,filename)) { 635 | int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data); 636 | stbi__end_write_file(&s); 637 | return r; 638 | } else 639 | return 0; 640 | } 641 | #endif // STBI_WRITE_NO_STDIO 642 | 643 | 644 | 
////////////////////////////////////////////////////////////////////////////// 645 | // 646 | // PNG writer 647 | // 648 | 649 | // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() 650 | #define stbiw__sbraw(a) ((int *) (a) - 2) 651 | #define stbiw__sbm(a) stbiw__sbraw(a)[0] 652 | #define stbiw__sbn(a) stbiw__sbraw(a)[1] 653 | 654 | #define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a)) 655 | #define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0) 656 | #define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a))) 657 | 658 | #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v)) 659 | #define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) 660 | #define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0) 661 | 662 | static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) 663 | { 664 | int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1; 665 | void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? 
(stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2); 666 | STBIW_ASSERT(p); 667 | if (p) { 668 | if (!*arr) ((int *) p)[1] = 0; 669 | *arr = (void *) ((int *) p + 2); 670 | stbiw__sbm(*arr) = m; 671 | } 672 | return *arr; 673 | } 674 | 675 | static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) 676 | { 677 | while (*bitcount >= 8) { 678 | stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); 679 | *bitbuffer >>= 8; 680 | *bitcount -= 8; 681 | } 682 | return data; 683 | } 684 | 685 | static int stbiw__zlib_bitrev(int code, int codebits) 686 | { 687 | int res=0; 688 | while (codebits--) { 689 | res = (res << 1) | (code & 1); 690 | code >>= 1; 691 | } 692 | return res; 693 | } 694 | 695 | static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit) 696 | { 697 | int i; 698 | for (i=0; i < limit && i < 258; ++i) 699 | if (a[i] != b[i]) break; 700 | return i; 701 | } 702 | 703 | static unsigned int stbiw__zhash(unsigned char *data) 704 | { 705 | stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); 706 | hash ^= hash << 3; 707 | hash += hash >> 5; 708 | hash ^= hash << 4; 709 | hash += hash >> 17; 710 | hash ^= hash << 25; 711 | hash += hash >> 6; 712 | return hash; 713 | } 714 | 715 | #define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) 716 | #define stbiw__zlib_add(code,codebits) \ 717 | (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) 718 | #define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c) 719 | // default huffman tables 720 | #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8) 721 | #define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) 722 | #define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7) 723 | #define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8) 724 | #define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? 
stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n)) 725 | #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) 726 | 727 | #define stbiw__ZHASH 16384 728 | 729 | unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) 730 | { 731 | static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 }; 732 | static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 }; 733 | static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 }; 734 | static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; 735 | unsigned int bitbuf=0; 736 | int i,j, bitcount=0; 737 | unsigned char *out = NULL; 738 | unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**)); 739 | if (quality < 5) quality = 5; 740 | 741 | stbiw__sbpush(out, 0x78); // DEFLATE 32K window 742 | stbiw__sbpush(out, 0x5e); // FLEVEL = 1 743 | stbiw__zlib_add(1,1); // BFINAL = 1 744 | stbiw__zlib_add(1,2); // BTYPE = 1 -- fixed huffman 745 | 746 | for (i=0; i < stbiw__ZHASH; ++i) 747 | hash_table[i] = NULL; 748 | 749 | i=0; 750 | while (i < data_len-3) { 751 | // hash next 3 bytes of data to be compressed 752 | int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3; 753 | unsigned char *bestloc = 0; 754 | unsigned char **hlist = hash_table[h]; 755 | int n = stbiw__sbcount(hlist); 756 | for (j=0; j < n; ++j) { 757 | if (hlist[j]-data > i-32768) { // if entry lies within window 758 | int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i); 759 | if (d >= best) best=d,bestloc=hlist[j]; 760 | } 761 | } 762 | // when hash table entry is too long, delete half the entries 763 | if (hash_table[h] && stbiw__sbn(hash_table[h]) == 
2*quality) { 764 | STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality); 765 | stbiw__sbn(hash_table[h]) = quality; 766 | } 767 | stbiw__sbpush(hash_table[h],data+i); 768 | 769 | if (bestloc) { 770 | // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal 771 | h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1); 772 | hlist = hash_table[h]; 773 | n = stbiw__sbcount(hlist); 774 | for (j=0; j < n; ++j) { 775 | if (hlist[j]-data > i-32767) { 776 | int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1); 777 | if (e > best) { // if next match is better, bail on current match 778 | bestloc = NULL; 779 | break; 780 | } 781 | } 782 | } 783 | } 784 | 785 | if (bestloc) { 786 | int d = (int) (data+i - bestloc); // distance back 787 | STBIW_ASSERT(d <= 32767 && best <= 258); 788 | for (j=0; best > lengthc[j+1]-1; ++j); 789 | stbiw__zlib_huff(j+257); 790 | if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]); 791 | for (j=0; d > distc[j+1]-1; ++j); 792 | stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5); 793 | if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]); 794 | i += best; 795 | } else { 796 | stbiw__zlib_huffb(data[i]); 797 | ++i; 798 | } 799 | } 800 | // write out final bytes 801 | for (;i < data_len; ++i) 802 | stbiw__zlib_huffb(data[i]); 803 | stbiw__zlib_huff(256); // end of block 804 | // pad with 0 bits to byte boundary 805 | while (bitcount) 806 | stbiw__zlib_add(0,1); 807 | 808 | for (i=0; i < stbiw__ZHASH; ++i) 809 | (void) stbiw__sbfree(hash_table[i]); 810 | STBIW_FREE(hash_table); 811 | 812 | { 813 | // compute adler32 on input 814 | unsigned int s1=1, s2=0; 815 | int blocklen = (int) (data_len % 5552); 816 | j=0; 817 | while (j < data_len) { 818 | for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1; 819 | s1 %= 65521, s2 %= 65521; 820 | j += blocklen; 821 | blocklen = 5552; 822 | } 823 | stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8)); 824 | stbiw__sbpush(out, STBIW_UCHAR(s2)); 825 | 
stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8)); 826 | stbiw__sbpush(out, STBIW_UCHAR(s1)); 827 | } 828 | *out_len = stbiw__sbn(out); 829 | // make returned pointer freeable 830 | STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); 831 | return (unsigned char *) stbiw__sbraw(out); 832 | } 833 | 834 | static unsigned int stbiw__crc32(unsigned char *buffer, int len) 835 | { 836 | static unsigned int crc_table[256] = 837 | { 838 | 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, 839 | 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 840 | 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 841 | 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 842 | 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 843 | 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 844 | 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, 845 | 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 846 | 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, 847 | 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, 848 | 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 849 | 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 850 | 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 851 | 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 852 | 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, 853 | 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 
0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, 854 | 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 855 | 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 856 | 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 857 | 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 858 | 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, 859 | 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, 860 | 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 861 | 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 862 | 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 863 | 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 864 | 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 865 | 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, 866 | 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 867 | 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, 868 | 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, 869 | 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D 870 | }; 871 | 872 | unsigned int crc = ~0u; 873 | int i; 874 | for (i=0; i < len; ++i) 875 | crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; 876 | return ~crc; 877 | } 878 | 879 | #define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4) 
880 | #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v)); 881 | #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3]) 882 | 883 | static void stbiw__wpcrc(unsigned char **data, int len) 884 | { 885 | unsigned int crc = stbiw__crc32(*data - len - 4, len+4); 886 | stbiw__wp32(*data, crc); 887 | } 888 | 889 | static unsigned char stbiw__paeth(int a, int b, int c) 890 | { 891 | int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c); 892 | if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); 893 | if (pb <= pc) return STBIW_UCHAR(b); 894 | return STBIW_UCHAR(c); 895 | } 896 | 897 | // @OPTIMIZE: provide an option that always forces left-predict or paeth predict 898 | unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len) 899 | { 900 | int ctype[5] = { -1, 0, 4, 2, 6 }; 901 | unsigned char sig[8] = { 137,80,78,71,13,10,26,10 }; 902 | unsigned char *out,*o, *filt, *zlib; 903 | signed char *line_buffer; 904 | int i,j,k,p,zlen; 905 | 906 | if (stride_bytes == 0) 907 | stride_bytes = x * n; 908 | 909 | filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0; 910 | line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; } 911 | for (j=0; j < y; ++j) { 912 | static int mapping[] = { 0,1,2,3,4 }; 913 | static int firstmap[] = { 0,1,0,5,6 }; 914 | int *mymap = (j != 0) ? 
mapping : firstmap; 915 | int best = 0, bestval = 0x7fffffff; 916 | for (p=0; p < 2; ++p) { 917 | for (k= p?best:0; k < 5; ++k) { // @TODO: clarity: rewrite this to go 0..5, and 'continue' the unwanted ones during 2nd pass 918 | int type = mymap[k],est=0; 919 | unsigned char *z = pixels + stride_bytes*j; 920 | for (i=0; i < n; ++i) 921 | switch (type) { 922 | case 0: line_buffer[i] = z[i]; break; 923 | case 1: line_buffer[i] = z[i]; break; 924 | case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break; 925 | case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break; 926 | case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break; 927 | case 5: line_buffer[i] = z[i]; break; 928 | case 6: line_buffer[i] = z[i]; break; 929 | } 930 | for (i=n; i < x*n; ++i) { 931 | switch (type) { 932 | case 0: line_buffer[i] = z[i]; break; 933 | case 1: line_buffer[i] = z[i] - z[i-n]; break; 934 | case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break; 935 | case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break; 936 | case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break; 937 | case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break; 938 | case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break; 939 | } 940 | } 941 | if (p) break; 942 | for (i=0; i < x*n; ++i) 943 | est += abs((signed char) line_buffer[i]); 944 | if (est < bestval) { bestval = est; best = k; } 945 | } 946 | } 947 | // when we get here, best contains the filter type, and line_buffer contains the data 948 | filt[j*(x*n+1)] = (unsigned char) best; 949 | STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n); 950 | } 951 | STBIW_FREE(line_buffer); 952 | zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory 953 | STBIW_FREE(filt); 954 | if (!zlib) return 0; 955 | 956 | // each tag requires 12 bytes of overhead 957 | out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen 
+ 12); 958 | if (!out) return 0; 959 | *out_len = 8 + 12+13 + 12+zlen + 12; 960 | 961 | o=out; 962 | STBIW_MEMMOVE(o,sig,8); o+= 8; 963 | stbiw__wp32(o, 13); // header length 964 | stbiw__wptag(o, "IHDR"); 965 | stbiw__wp32(o, x); 966 | stbiw__wp32(o, y); 967 | *o++ = 8; 968 | *o++ = STBIW_UCHAR(ctype[n]); 969 | *o++ = 0; 970 | *o++ = 0; 971 | *o++ = 0; 972 | stbiw__wpcrc(&o,13); 973 | 974 | stbiw__wp32(o, zlen); 975 | stbiw__wptag(o, "IDAT"); 976 | STBIW_MEMMOVE(o, zlib, zlen); 977 | o += zlen; 978 | STBIW_FREE(zlib); 979 | stbiw__wpcrc(&o, zlen); 980 | 981 | stbiw__wp32(o,0); 982 | stbiw__wptag(o, "IEND"); 983 | stbiw__wpcrc(&o,0); 984 | 985 | STBIW_ASSERT(o == out + *out_len); 986 | 987 | return out; 988 | } 989 | 990 | #ifndef STBI_WRITE_NO_STDIO 991 | STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes) 992 | { 993 | FILE *f; 994 | int len; 995 | unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len); 996 | if (png == NULL) return 0; 997 | f = fopen(filename, "wb"); 998 | if (!f) { STBIW_FREE(png); return 0; } 999 | fwrite(png, 1, len, f); 1000 | fclose(f); 1001 | STBIW_FREE(png); 1002 | return 1; 1003 | } 1004 | #endif 1005 | 1006 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes) 1007 | { 1008 | int len; 1009 | unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len); 1010 | if (png == NULL) return 0; 1011 | func(context, png, len); 1012 | STBIW_FREE(png); 1013 | return 1; 1014 | } 1015 | 1016 | #endif // STB_IMAGE_WRITE_IMPLEMENTATION 1017 | 1018 | /* Revision history 1019 | 1.04 (2017-03-03) 1020 | monochrome BMP expansion 1021 | 1.03 ??? 
1022 | 1.02 (2016-04-02) 1023 | avoid allocating large structures on the stack 1024 | 1.01 (2016-01-16) 1025 | STBIW_REALLOC_SIZED: support allocators with no realloc support 1026 | avoid race-condition in crc initialization 1027 | minor compile issues 1028 | 1.00 (2015-09-14) 1029 | installable file IO function 1030 | 0.99 (2015-09-13) 1031 | warning fixes; TGA rle support 1032 | 0.98 (2015-04-08) 1033 | added STBIW_MALLOC, STBIW_ASSERT etc 1034 | 0.97 (2015-01-18) 1035 | fixed HDR asserts, rewrote HDR rle logic 1036 | 0.96 (2015-01-17) 1037 | add HDR output 1038 | fix monochrome BMP 1039 | 0.95 (2014-08-17) 1040 | add monochrome TGA output 1041 | 0.94 (2014-05-31) 1042 | rename private functions to avoid conflicts with stb_image.h 1043 | 0.93 (2014-05-27) 1044 | warning fixes 1045 | 0.92 (2010-08-01) 1046 | casts to unsigned char to fix warnings 1047 | 0.91 (2010-07-17) 1048 | first public release 1049 | 0.90 first internal release 1050 | */ 1051 | 1052 | /* 1053 | ------------------------------------------------------------------------------ 1054 | This software is available under 2 licenses -- choose whichever you prefer. 1055 | ------------------------------------------------------------------------------ 1056 | ALTERNATIVE A - MIT License 1057 | Copyright (c) 2017 Sean Barrett 1058 | Permission is hereby granted, free of charge, to any person obtaining a copy of 1059 | this software and associated documentation files (the "Software"), to deal in 1060 | the Software without restriction, including without limitation the rights to 1061 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 1062 | of the Software, and to permit persons to whom the Software is furnished to do 1063 | so, subject to the following conditions: 1064 | The above copyright notice and this permission notice shall be included in all 1065 | copies or substantial portions of the Software. 
1066 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1067 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1068 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1069 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1070 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1071 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1072 | SOFTWARE. 1073 | ------------------------------------------------------------------------------ 1074 | ALTERNATIVE B - Public Domain (www.unlicense.org) 1075 | This is free and unencumbered software released into the public domain. 1076 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 1077 | software, either in source code form or as a compiled binary, for any purpose, 1078 | commercial or non-commercial, and by any means. 1079 | In jurisdictions that recognize copyright laws, the author or authors of this 1080 | software dedicate any and all copyright interest in the software to the public 1081 | domain. We make this dedication for the benefit of the public at large and to 1082 | the detriment of our heirs and successors. We intend this dedication to be an 1083 | overt act of relinquishment in perpetuity of all present and future rights to 1084 | this software under copyright law. 1085 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1086 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1087 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1088 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 1089 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 1090 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
1091 | ------------------------------------------------------------------------------ 1092 | */ 1093 | -------------------------------------------------------------------------------- /RBFilter_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include "RBFilter_AVX2.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | #define MAX_RANGE_TABLE_SIZE 255 14 | #define ALIGN_SIZE 32 15 | 16 | // only 1 of following 2 should be defined 17 | #define EDGE_COLOR_USE_MAXIMUM 18 | //#define EDGE_COLOR_USE_ADDITION 19 | 20 | // if EDGE_COLOR_USE_MAXIMUM is defined, then edge color detection works by calculating 21 | // maximum difference among 3 components (RGB) of 2 colors, which tends to result in lower differences (since only largest among 3 is selected) 22 | // if EDGE_COLOR_USE_ADDITION is defined, then edge color detection works by calculating 23 | // sum of all 3 components, while enforcing 255 maximum. 
This method is much more sensitive to small differences 24 | 25 | #if defined(EDGE_COLOR_USE_MAXIMUM) && defined(EDGE_COLOR_USE_ADDITION) 26 | #error Only 1 of those can be defined 27 | #endif 28 | 29 | #if !defined(EDGE_COLOR_USE_MAXIMUM) && !defined(EDGE_COLOR_USE_ADDITION) 30 | #error 1 of those must be defined 31 | #endif 32 | 33 | CRBFilterAVX2::CRBFilterAVX2() 34 | { 35 | m_range_table = new float[MAX_RANGE_TABLE_SIZE + 1]; 36 | memset(m_range_table, 0, (MAX_RANGE_TABLE_SIZE + 1) * sizeof(float)); 37 | } 38 | 39 | CRBFilterAVX2::~CRBFilterAVX2() 40 | { 41 | release(); 42 | 43 | delete[] m_range_table; 44 | } 45 | 46 | bool CRBFilterAVX2::initialize(int width, int height, int thread_count, bool pipelined) 47 | { 48 | // basic sanity check, not strict 49 | if (width < 16 || width > 10000) 50 | return false; 51 | 52 | if (height < 2 || height > 10000) 53 | return false; 54 | 55 | if (thread_count < 1 || thread_count > RBF_MAX_THREADS) 56 | return false; 57 | 58 | release(); 59 | 60 | m_thread_count = thread_count; 61 | 62 | // round height to nearest even number 63 | if (height & 1) 64 | height++; 65 | 66 | m_reserved_width = getOptimalPitch(width) / 4; 67 | m_reserved_height = height; 68 | 69 | 70 | m_stage_buffer[0] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); 71 | if (!m_stage_buffer[0]) 72 | return false; 73 | 74 | if (pipelined) 75 | { 76 | for (int i = 1; i < STAGE_BUFFER_COUNT; i++) 77 | { 78 | m_stage_buffer[i] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); 79 | if (!m_stage_buffer[i]) 80 | return false; 81 | } 82 | } 83 | 84 | ///////////////// 85 | m_h_line_cache = new (std::nothrow) float*[m_thread_count]; 86 | if (!m_h_line_cache) 87 | return false; 88 | 89 | // zero just in case 90 | for (int i = 0; i < m_thread_count; i++) 91 | m_h_line_cache[i] = nullptr; 92 | 93 | for (int i = 0; i < m_thread_count; i++) 94 | { 95 | m_h_line_cache[i] = 
(float*)_aligned_malloc(m_reserved_width * 12 * sizeof(float) * 2 + 128, ALIGN_SIZE); 96 | if (!m_h_line_cache[i]) 97 | return false; 98 | 99 | // 1st 8 bytes of line cache should remain constant zero 100 | memset(m_h_line_cache[i], 0, 8 * sizeof(float)); 101 | } 102 | 103 | //////////////// 104 | m_v_line_cache = new (std::nothrow) float*[m_thread_count]; 105 | if (!m_v_line_cache) 106 | return false; 107 | 108 | for (int i = 0; i < m_thread_count; i++) 109 | m_v_line_cache[i] = nullptr; 110 | 111 | int v_line_size = (m_reserved_width * 16 * sizeof(float)) / m_thread_count; 112 | for (int i = 0; i < m_thread_count; i++) 113 | { 114 | m_v_line_cache[i] = (float*)_aligned_malloc(v_line_size, ALIGN_SIZE); 115 | if (!m_v_line_cache[i]) 116 | return false; 117 | } 118 | 119 | return true; 120 | } 121 | 122 | void CRBFilterAVX2::release() 123 | { 124 | for (int i = 0; i < STAGE_BUFFER_COUNT; i++) 125 | { 126 | if (m_stage_buffer[i]) 127 | { 128 | _aligned_free(m_stage_buffer[i]); 129 | m_stage_buffer[i] = nullptr; 130 | } 131 | } 132 | 133 | if (m_h_line_cache) 134 | { 135 | for (int i = 0; i < m_thread_count; i++) 136 | { 137 | if (m_h_line_cache[i]) 138 | _aligned_free(m_h_line_cache[i]); 139 | } 140 | delete[] m_h_line_cache; 141 | m_h_line_cache = nullptr; 142 | } 143 | 144 | if (m_v_line_cache) 145 | { 146 | for (int i = 0; i < m_thread_count; i++) 147 | { 148 | if (m_v_line_cache[i]) 149 | _aligned_free(m_v_line_cache[i]); 150 | } 151 | delete[] m_v_line_cache; 152 | m_v_line_cache = nullptr; 153 | } 154 | 155 | m_reserved_width = 0; 156 | m_reserved_height = 0; 157 | m_thread_count = 0; 158 | m_pipelined = false; 159 | m_filter_counter = 0; 160 | } 161 | 162 | int CRBFilterAVX2::getOptimalPitch(int width) const 163 | { 164 | width *= 4; 165 | 166 | int round_up = ALIGN_SIZE * m_thread_count; 167 | if (width % round_up) 168 | { 169 | width += round_up - width % round_up; 170 | } 171 | 172 | return width; 173 | } 174 | 175 | void CRBFilterAVX2::setSigma(float 
sigma_spatial, float sigma_range) 176 | { 177 | if (m_sigma_spatial != sigma_spatial || m_sigma_range != sigma_range) 178 | { 179 | m_sigma_spatial = sigma_spatial; 180 | m_sigma_range = sigma_range; 181 | 182 | double alpha_f = (exp(-sqrt(2.0) / (sigma_spatial * 255.0))); 183 | m_inv_alpha_f = (float)(1.0 - alpha_f); 184 | double inv_sigma_range = 1.0 / (sigma_range * MAX_RANGE_TABLE_SIZE); 185 | { 186 | double ii = 0.f; 187 | for (int i = 0; i <= MAX_RANGE_TABLE_SIZE; i++, ii -= 1.0) 188 | { 189 | m_range_table[i] = (float)(alpha_f * exp(ii * inv_sigma_range)); 190 | } 191 | } 192 | } 193 | } 194 | 195 | // example of edge color difference calculation from original implementation 196 | // idea is to fit maximum edge color difference as single number in 0-255 range 197 | // colors are added then 2 components are scaled 4x while 1 complement is scaled 2x 198 | // this means 1 of the components is more dominant 199 | 200 | //int getDiffFactor(const unsigned char* color1, const unsigned char* color2) 201 | //{ 202 | // int c1 = abs(color1[0] - color2[0]); 203 | // int c2 = abs(color1[1] - color2[1]); 204 | // int c3 = abs(color1[2] - color2[2]); 205 | // 206 | // return ((c1 + c3) >> 2) + (c2 >> 1); 207 | //} 208 | 209 | 210 | inline void getDiffFactor3x(__m256i pix8, __m256i pix8p, __m256i* diff8x) 211 | { 212 | __m256i byte_mask = _mm256_set1_epi32(255); 213 | 214 | // get absolute difference for each component per pixel 215 | __m256i diff = _mm256_sub_epi8(_mm256_max_epu8(pix8, pix8p), _mm256_min_epu8(pix8, pix8p)); 216 | 217 | #ifdef EDGE_COLOR_USE_MAXIMUM 218 | // get maximum of 3 components 219 | __m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component 220 | diff = _mm256_max_epu8(diff, diff_shift1); 221 | diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component 222 | diff = _mm256_max_epu8(diff, diff_shift1); 223 | // skip alpha component 224 | diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte 225 | #endif 226 | 227 | 
#ifdef EDGE_COLOR_USE_ADDITION 228 | // add all component differences and saturate 229 | __m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component 230 | diff = _mm256_adds_epu8(diff, diff_shift1); 231 | diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component 232 | diff = _mm256_adds_epu8(diff, diff_shift1); 233 | diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte 234 | #endif 235 | 236 | _mm256_store_si256(diff8x, diff); 237 | } 238 | 239 | void CRBFilterAVX2::horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) 240 | { 241 | // force height segments to be even cause this filter processes 2 lines at a time 242 | int height_segment = (height / m_thread_count) & (~1); 243 | int buffer_offset = thread_index * height_segment * pitch; 244 | img_src += buffer_offset; 245 | img_dst += buffer_offset; 246 | 247 | int width32 = pitch / 32; 248 | 249 | // last segment should account for uneven height 250 | // since reserve buffer height is rounded up to even number, it's OK if source is uneven 251 | // but that assumes hozitonal filter output buffer is the reservered buffer, or that destination is rounded up to even number 252 | if (thread_index + 1 == m_thread_count) 253 | height_segment = height - thread_index * height_segment; 254 | 255 | // float* alpha_cache_start = m_alpha_cache[thread_index]; 256 | // cache line structure: 257 | // 4 floats of alpha_f from line 1 258 | // 4 floats of alpha_f from line 2 259 | // 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 1 260 | // 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 2 261 | // 4 floats of 1st pass result color from line 1 262 | // 4 floats of 1st pass result color from line 2 263 | float* line_cache = m_h_line_cache[thread_index]; 264 | const float* range_table = m_range_table; 265 | 266 | __declspec(align(32)) long color_diff[16]; 267 | 268 | _mm256_zeroall(); 269 | 
270 | __m256i mask_unpack = _mm256_setr_epi8(12, -1, -1, -1, // pixel 1 R 271 | 13, -1, -1, -1, // pixel 1 G 272 | 14, -1, -1, -1, // pixel 1 B 273 | 15, -1, -1, -1, // pixel 1 A 274 | 12, -1, -1, -1, // pixel 2 R 275 | 13, -1, -1, -1, // pixel 2 G 276 | 14, -1, -1, -1, // pixel 2 B 277 | 15, -1, -1, -1);// pixel 2 A 278 | 279 | __m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1 280 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2 281 | 282 | __m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f); 283 | 284 | // process 2 horizontal lines at a time 285 | for (int y = 0; y < height_segment; y+= 2) 286 | { 287 | __m256 alpha_prev = _mm256_set1_ps(1.f); 288 | __m256 color_prev; 289 | 290 | 291 | float* line_buffer = line_cache + 24 * pitch / 4; 292 | // 1st line 293 | int buffer_inc = (y + 1) * pitch - 32; 294 | const __m256i* src1_8xCur = (const __m256i*)(img_src + buffer_inc); 295 | const __m256i* src1_8xPrev = (const __m256i*)(img_src + buffer_inc + 4); 296 | // 2nd line 297 | buffer_inc += pitch; 298 | const __m256i* src2_8xCur = (const __m256i*)(img_src + buffer_inc); 299 | const __m256i* src2_8xPrev = (const __m256i*)(img_src + buffer_inc + 4); 300 | 301 | 302 | ///////////////////////////// 303 | // right to left pass 304 | for (int x = 0; x < width32; x++) 305 | { 306 | __m256i pix8_1 = _mm256_load_si256(src1_8xCur--); 307 | __m256i pix8p_1 = _mm256_loadu_si256(src1_8xPrev--); 308 | getDiffFactor3x(pix8_1, pix8p_1, (__m256i*)color_diff); 309 | 310 | __m256i pix8_2 = _mm256_load_si256(src2_8xCur--); 311 | __m256i pix8p_2 = _mm256_loadu_si256(src2_8xPrev--); 312 | getDiffFactor3x(pix8_2, pix8p_2, (__m256i*)(color_diff + 8)); 313 | 314 | // last 4 pixels of 2 lines 315 | __m256i pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 1 | (3 << 4)); 316 | 317 | //////////////////// 318 | // pixel 1 unpack 319 | { 320 | // alpha factor 321 | float alpha2_f = range_table[color_diff[7]]; 322 | 
float alpha1_f = range_table[color_diff[7 + 8]]; 323 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 324 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 325 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 326 | 327 | // source pixel 328 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 329 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 330 | if (x == 0) // have to initialize prev_color with last pixel color, this condition has no noticeable penalty 331 | color_prev = pix2f; 332 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 333 | _mm256_store_ps(line_buffer + 8, pix2f); 334 | 335 | // filter 336 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 337 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 338 | 339 | // final color 340 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 341 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 342 | line_buffer -= 24; 343 | } 344 | 345 | //////////////////// 346 | // pixel 2 unpack 347 | { 348 | // alpha factor 349 | float alpha2_f = range_table[color_diff[6]]; 350 | float alpha1_f = range_table[color_diff[6 + 8]]; 351 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 352 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 353 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 354 | 355 | // source pixel 356 | pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel 357 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 358 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 359 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 360 | _mm256_store_ps(line_buffer + 8, pix2f); 361 | 362 | // filter 363 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, 
inv_alpha); // filter factor 364 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 365 | 366 | // final color 367 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 368 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 369 | line_buffer -= 24; 370 | } 371 | 372 | 373 | //////////////////// 374 | // pixel 3 unpack 375 | { 376 | // alpha factor 377 | float alpha2_f = range_table[color_diff[5]]; 378 | float alpha1_f = range_table[color_diff[5 + 8]]; 379 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 380 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 381 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 382 | 383 | // source pixel 384 | pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel 385 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 386 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 387 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 388 | _mm256_store_ps(line_buffer + 8, pix2f); 389 | 390 | // filter 391 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 392 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 393 | 394 | // final color 395 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 396 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 397 | line_buffer -= 24; 398 | } 399 | 400 | //////////////////// 401 | // pixel 4 unpack 402 | { 403 | // alpha factor 404 | float alpha2_f = range_table[color_diff[4]]; 405 | float alpha1_f = range_table[color_diff[4 + 8]]; 406 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 407 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 408 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 409 | 410 | // source pixel 411 | pix8 = _mm256_slli_si256(pix8, 4); // shift 
left to unpack next pixel 412 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 413 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 414 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 415 | _mm256_store_ps(line_buffer + 8, pix2f); 416 | 417 | // filter 418 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 419 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 420 | 421 | // final color 422 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 423 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 424 | line_buffer -= 24; 425 | } 426 | 427 | // next 4 pixels of 2 lines 428 | pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 2 << 4); 429 | 430 | 431 | //////////////////// 432 | // pixel 5 unpack 433 | { 434 | // alpha factor 435 | float alpha2_f = range_table[color_diff[3]]; 436 | float alpha1_f = range_table[color_diff[3 + 8]]; 437 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 438 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 439 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 440 | 441 | // source pixel 442 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 443 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 444 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 445 | _mm256_store_ps(line_buffer + 8, pix2f); 446 | 447 | // filter 448 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 449 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 450 | 451 | // final color 452 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 453 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 454 | line_buffer -= 24; 455 | } 456 | 457 | 458 | 
//////////////////// 459 | // pixel 6 unpack 460 | { 461 | // alpha factor 462 | float alpha2_f = range_table[color_diff[2]]; 463 | float alpha1_f = range_table[color_diff[2 + 8]]; 464 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 465 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 466 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 467 | 468 | // source pixel 469 | pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel 470 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 471 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 472 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 473 | _mm256_store_ps(line_buffer + 8, pix2f); 474 | 475 | // filter 476 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 477 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 478 | 479 | // final color 480 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 481 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 482 | line_buffer -= 24; 483 | } 484 | 485 | 486 | //////////////////// 487 | // pixel 7 unpack 488 | { 489 | // alpha factor 490 | float alpha2_f = range_table[color_diff[1]]; 491 | float alpha1_f = range_table[color_diff[1 + 8]]; 492 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 493 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 494 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 495 | 496 | // source pixel 497 | pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel 498 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 499 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 500 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 501 | _mm256_store_ps(line_buffer + 8, pix2f); 502 | 503 
| // filter 504 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 505 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 506 | 507 | // final color 508 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 509 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 510 | line_buffer -= 24; 511 | } 512 | 513 | 514 | //////////////////// 515 | // pixel 8 unpack 516 | { 517 | // alpha factor 518 | float alpha2_f = range_table[color_diff[0]]; 519 | float alpha1_f = range_table[color_diff[0 + 8]]; 520 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 521 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 522 | _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights 523 | 524 | // source pixel 525 | pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel 526 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD 527 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 528 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color 529 | _mm256_store_ps(line_buffer + 8, pix2f); 530 | 531 | // filter 532 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 533 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 534 | 535 | // final color 536 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 537 | _mm256_store_ps(line_buffer + 16, out_color); // cache final color 538 | line_buffer -= 24; 539 | } 540 | 541 | 542 | } 543 | 544 | ///////////////////////////// 545 | // left to right pass 546 | __m256i* dst1_pix8 = (__m256i*)(img_dst + y * pitch); 547 | __m256i* dst2_pix8 = (__m256i*)(img_dst + (y + 1) * pitch); 548 | 549 | for (int x = 0; x < width32; x++) 550 | { 551 | __m256i result1; 552 | __m256i result2; 553 | 554 | ///////////// 555 | // 1st 4 pixels 556 | // pixel 1 557 | { 558 
| // alpha 559 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 560 | line_buffer += 24; 561 | 562 | // get pre-multiplied source color 563 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 564 | 565 | // first pixel in line needs to initialize color_prev to original source color 566 | if (x == 0) 567 | color_prev = _mm256_div_ps(pix2f, inv_alpha); // source color was premultiplied 568 | 569 | // filter 570 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 571 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 572 | 573 | // final color 574 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 575 | 576 | // get final color from previous pass 577 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 578 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 579 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 580 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 581 | 582 | // pack result 583 | result1 = _mm256_shuffle_epi8(pix2i, mask_pack); 584 | } 585 | 586 | 587 | // pixel 2 588 | { 589 | // alpha 590 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 591 | line_buffer += 24; 592 | 593 | // get pre-multiplied source color 594 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 595 | 596 | // filter 597 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 598 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 599 | 600 | // final color 601 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 602 | 603 | // get final color from previous pass 604 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 605 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 606 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 607 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 608 | 609 | // 
pack result 610 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 611 | result1 = _mm256_srli_si256(result1, 4); // shift 612 | result1 = _mm256_or_si256(result1, pix2i); // combine 613 | } 614 | 615 | // pixel 3 616 | { 617 | // alpha 618 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 619 | line_buffer += 24; 620 | 621 | // get pre-multiplied source color 622 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 623 | 624 | // filter 625 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 626 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 627 | 628 | // final color 629 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 630 | 631 | // get final color from previous pass 632 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 633 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 634 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 635 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 636 | 637 | // pack result 638 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 639 | result1 = _mm256_srli_si256(result1, 4); // shift 640 | result1 = _mm256_or_si256(result1, pix2i); // combine 641 | } 642 | 643 | // pixel 4 644 | { 645 | // alpha 646 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 647 | line_buffer += 24; 648 | 649 | // get pre-multiplied source color 650 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 651 | 652 | // filter 653 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 654 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 655 | 656 | // final color 657 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 658 | 659 | // get final color from previous pass 660 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 661 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 662 | __m256i 
pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 663 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 664 | 665 | // pack result 666 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 667 | result1 = _mm256_srli_si256(result1, 4); // shift 668 | result1 = _mm256_or_si256(result1, pix2i); // combine 669 | } 670 | 671 | // next 4 pixels packed in result2 672 | // pixel 5 673 | { 674 | // alpha 675 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 676 | line_buffer += 24; 677 | 678 | // get pre-multiplied source color 679 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 680 | 681 | // filter 682 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 683 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 684 | 685 | // final color 686 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 687 | 688 | // get final color from previous pass 689 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 690 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 691 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 692 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 693 | 694 | // pack result 695 | result2 = _mm256_shuffle_epi8(pix2i, mask_pack); 696 | } 697 | 698 | // pixel 6 699 | { 700 | // alpha 701 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 702 | line_buffer += 24; 703 | 704 | // get pre-multiplied source color 705 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 706 | 707 | // filter 708 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 709 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 710 | 711 | // final color 712 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 713 | 714 | // get final color from previous pass 715 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 716 | out_color = _mm256_add_ps(out_color, 
pix2f_p); // combine it with current final color 717 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 718 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 719 | 720 | // pack result 721 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 722 | result2 = _mm256_srli_si256(result2, 4); // shift 723 | result2 = _mm256_or_si256(result2, pix2i); // combine 724 | } 725 | 726 | // pixel 7 727 | { 728 | // alpha 729 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 730 | line_buffer += 24; 731 | 732 | // get pre-multiplied source color 733 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 734 | 735 | // filter 736 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 737 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 738 | 739 | // final color 740 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 741 | 742 | // get final color from previous pass 743 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 744 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 745 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 746 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 747 | 748 | // pack result 749 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 750 | result2 = _mm256_srli_si256(result2, 4); // shift 751 | result2 = _mm256_or_si256(result2, pix2i); // combine 752 | } 753 | 754 | // pixel 8 755 | { 756 | // alpha 757 | __m256 alpha_f_8x = _mm256_load_ps(line_buffer); 758 | line_buffer += 24; 759 | 760 | // get pre-multiplied source color 761 | __m256 pix2f = _mm256_load_ps(line_buffer + 8); 762 | 763 | // filter 764 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 765 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 766 | 767 | // final color 768 | __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color 769 | 770 | // 
get final color from previous pass 771 | __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); 772 | out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color 773 | __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer 774 | pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 775 | 776 | // pack result 777 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 778 | result2 = _mm256_srli_si256(result2, 4); // shift 779 | result2 = _mm256_or_si256(result2, pix2i); // combine 780 | } 781 | 782 | // separate packed results into lines 783 | __m256i line1 = _mm256_permute2f128_si256(result1, result2, 2 << 4); 784 | __m256i line2 = _mm256_permute2f128_si256(result1, result2, 1 | (3 << 4)); 785 | 786 | // store result 787 | _mm256_store_si256(dst1_pix8++, line1); 788 | _mm256_store_si256(dst2_pix8++, line2); 789 | } 790 | } 791 | 792 | } 793 | 794 | 795 | void CRBFilterAVX2::verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) 796 | { 797 | int width_segment = width / m_thread_count; 798 | // make sure width segments round to 32 byte boundary 799 | width_segment -= width_segment % 8; 800 | int start_offset = width_segment * thread_index; 801 | if (thread_index == m_thread_count - 1) // last one 802 | { 803 | width_segment = getOptimalPitch(width) / 4 - start_offset; 804 | } 805 | 806 | int width8 = width_segment / 8; 807 | 808 | // adjust img buffer starting positions 809 | img_src += start_offset * 4; 810 | img_dst += start_offset * 4; 811 | 812 | float* line_cache = m_v_line_cache[thread_index]; 813 | const float* range_table = m_range_table; 814 | 815 | _mm256_zeroall(); 816 | 817 | __m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f); 818 | 819 | __m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1 820 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2 821 | 822 | __m256i mask_unpack = 
_mm256_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, // pixel 1 823 | 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); // pixel 2 824 | 825 | // used to store maximum difference between 2 pixels 826 | __declspec(align(32)) long color_diff[8]; 827 | 828 | ///////////////// 829 | // Bottom to top pass first 830 | { 831 | // last line processed separately since no previous 832 | { 833 | float* line_buffer = line_cache; 834 | __m256i* dst_buf = (__m256i*)(img_dst + (height - 1) * pitch); 835 | __m256i* src_8xCur = (__m256i*)(img_src + (height - 1) * pitch); 836 | 837 | __m256 one = _mm256_set1_ps(1.f); 838 | 839 | for (int x = 0; x < width8; x++) 840 | { 841 | __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel 842 | _mm256_store_si256(dst_buf++, pix8); // copy to destination 843 | 844 | for (int i = 0; i < 4; i++) 845 | { 846 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 847 | pix8 = _mm256_srli_si256(pix8, 4); // shift right 848 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 849 | 850 | _mm256_store_ps(line_buffer, one); 851 | _mm256_store_ps(line_buffer + 8, pix2f); 852 | 853 | line_buffer += 16; 854 | } 855 | } 856 | } 857 | 858 | // process other lines 859 | for (int y = height - 2; y >= 0; y--) 860 | { 861 | float* line_buffer = line_cache; 862 | __m256i* dst_buf = (__m256i*)(img_dst + y * pitch); 863 | __m256i* src_8xCur = (__m256i*)(img_src + y * pitch); 864 | __m256i* src_8xPrev = (__m256i*)(img_src + (y + 1) * pitch); 865 | 866 | for (int x = 0; x < width8; x++) 867 | { 868 | __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel 869 | __m256i pix8p = _mm256_load_si256(src_8xPrev++); 870 | __m256i pix_out; // final 8x packed pixels 871 | 872 | // get color differences 873 | getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff); 874 | 875 | //////////////////// 876 | // pixel 1, 5 unpack 877 | { 878 | // alpha factor 879 | float alpha2_f = range_table[color_diff[0]]; 880 | 
float alpha1_f = range_table[color_diff[4]]; 881 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 882 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 883 | 884 | // load previous line color factor 885 | __m256 alpha_prev = _mm256_load_ps(line_buffer); 886 | // load previous line color 887 | __m256 color_prev = _mm256_load_ps(line_buffer + 8); 888 | 889 | // unpack current source pixel 890 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 891 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 892 | 893 | // filter 894 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); 895 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 896 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 897 | 898 | // store current factor and color as previous for next cycle 899 | _mm256_store_ps(line_buffer, alpha_prev); 900 | _mm256_store_ps(line_buffer + 8, color_prev); 901 | line_buffer += 16; 902 | 903 | // calculate final color 904 | pix2f = _mm256_div_ps(color_prev, alpha_prev); 905 | 906 | // pack float pixel into byte pixel 907 | pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer 908 | pix_out = _mm256_shuffle_epi8(pix2i, mask_pack); 909 | } 910 | 911 | // loop for other pixels 912 | for(int i=1; i<4; i++) 913 | { 914 | // alpha factor 915 | float alpha2_f = range_table[color_diff[i]]; 916 | float alpha1_f = range_table[color_diff[i+4]]; 917 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 918 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 919 | 920 | // load previous line color factor 921 | __m256 alpha_prev = _mm256_load_ps(line_buffer); 922 | // load previous line color 923 | __m256 color_prev = _mm256_load_ps(line_buffer + 8); 924 | 925 | // unpack current source pixel 926 | pix8 = _mm256_srli_si256(pix8, 4); // shift right 927 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 928 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 929 
| 930 | // filter 931 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); 932 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 933 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 934 | 935 | // store current factor and color as previous for next cycle 936 | _mm256_store_ps(line_buffer, alpha_prev); 937 | _mm256_store_ps(line_buffer + 8, color_prev); 938 | line_buffer += 16; 939 | 940 | // calculate final color 941 | pix2f = _mm256_div_ps(color_prev, alpha_prev); 942 | 943 | // pack float pixel into byte pixel 944 | pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer 945 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 946 | pix_out = _mm256_srli_si256(pix_out, 4); // shift 947 | pix_out = _mm256_or_si256(pix_out, pix2i); // combine 948 | } 949 | 950 | // store result 951 | _mm256_store_si256(dst_buf++, pix_out); 952 | } 953 | } 954 | } 955 | 956 | ///////////////// 957 | // Top to bottom pass last 958 | { 959 | 960 | // first line processed separately since no previous 961 | { 962 | float* line_buffer = line_cache; 963 | __m256i* dst_line = (__m256i*)img_dst; 964 | __m256i* src_8xCur = (__m256i*)img_src; 965 | 966 | __m256 one = _mm256_set1_ps(1.f); 967 | 968 | for (int x = 0; x < width8; x++) 969 | { 970 | __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel 971 | __m256i pix8_d = _mm256_load_si256(dst_line); 972 | pix8_d = _mm256_avg_epu8(pix8_d, pix8); // average out 973 | _mm256_store_si256(dst_line++, pix8_d); 974 | 975 | for (int i = 0; i < 4; i++) 976 | { 977 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 978 | pix8 = _mm256_srli_si256(pix8, 4); // shift right 979 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 980 | 981 | _mm256_store_ps(line_buffer, one); 982 | _mm256_store_ps(line_buffer + 8, pix2f); 983 | 984 | line_buffer += 16; 985 | } 986 | } 987 | } 988 | 989 | // process other lines 990 | for (int y = 1; y < height; y++) 991 | { 992 | float* 
line_buffer = line_cache; 993 | __m256i* dst_buf = (__m256i*)(img_dst + y * pitch); 994 | __m256i* src_8xCur = (__m256i*)(img_src + y * pitch); 995 | __m256i* src_8xPrev = (__m256i*)(img_src + (y - 1) * pitch); 996 | 997 | for (int x = 0; x < width8; x++) 998 | { 999 | __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel 1000 | __m256i pix8p = _mm256_load_si256(src_8xPrev++); 1001 | __m256i pix_out; // final 8x packed pixels 1002 | 1003 | // get color differences 1004 | getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff); 1005 | 1006 | //////////////////// 1007 | // pixel 1, 5 unpack 1008 | { 1009 | // alpha factor 1010 | float alpha2_f = range_table[color_diff[0]]; 1011 | float alpha1_f = range_table[color_diff[4]]; 1012 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 1013 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 1014 | 1015 | // load previous line color factor 1016 | __m256 alpha_prev = _mm256_load_ps(line_buffer); 1017 | // load previous line color 1018 | __m256 color_prev = _mm256_load_ps(line_buffer + 8); 1019 | 1020 | // unpack current source pixel 1021 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 1022 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 1023 | 1024 | // filter 1025 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); 1026 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 1027 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 1028 | 1029 | // store current factor and color as previous for next cycle 1030 | _mm256_store_ps(line_buffer, alpha_prev); 1031 | _mm256_store_ps(line_buffer + 8, color_prev); 1032 | line_buffer += 16; 1033 | 1034 | // calculate final color 1035 | pix2f = _mm256_div_ps(color_prev, alpha_prev); 1036 | 1037 | // pack float pixel into byte pixel 1038 | pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer 1039 | pix_out = _mm256_shuffle_epi8(pix2i, mask_pack); 1040 | } 1041 | 1042 | // loop for other 
pixels 1043 | for (int i = 1; i<4; i++) 1044 | { 1045 | // alpha factor 1046 | float alpha2_f = range_table[color_diff[i]]; 1047 | float alpha1_f = range_table[color_diff[i + 4]]; 1048 | __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, 1049 | alpha2_f, alpha2_f, alpha2_f, alpha2_f); 1050 | 1051 | // load previous line color factor 1052 | __m256 alpha_prev = _mm256_load_ps(line_buffer); 1053 | // load previous line color 1054 | __m256 color_prev = _mm256_load_ps(line_buffer + 8); 1055 | 1056 | // unpack current source pixel 1057 | pix8 = _mm256_srli_si256(pix8, 4); // shift right 1058 | __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); 1059 | __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats 1060 | 1061 | // filter 1062 | pix2f = _mm256_mul_ps(pix2f, inv_alpha); 1063 | alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor 1064 | color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color 1065 | 1066 | // store current factor and color as previous for next cycle 1067 | _mm256_store_ps(line_buffer, alpha_prev); 1068 | _mm256_store_ps(line_buffer + 8, color_prev); 1069 | line_buffer += 16; 1070 | 1071 | // calculate final color 1072 | pix2f = _mm256_div_ps(color_prev, alpha_prev); 1073 | 1074 | // pack float pixel into byte pixel 1075 | pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer 1076 | pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); 1077 | pix_out = _mm256_srli_si256(pix_out, 4); // shift 1078 | pix_out = _mm256_or_si256(pix_out, pix2i); // combine 1079 | } 1080 | 1081 | // average result with previous values in destination buffer 1082 | __m256i pix8_d = _mm256_load_si256(dst_buf); 1083 | pix_out = _mm256_avg_epu8(pix8_d, pix_out); 1084 | _mm256_store_si256(dst_buf++, pix_out); 1085 | } 1086 | } 1087 | } 1088 | } 1089 | 1090 | bool CRBFilterAVX2::filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) 1091 | { 1092 | // basic 
error checking 1093 | if (!m_stage_buffer[0]) 1094 | return false; 1095 | 1096 | if (width < 32 || width > m_reserved_width) 1097 | return false; 1098 | 1099 | if (height < 16 || height > m_reserved_height) 1100 | return false; 1101 | 1102 | if (pitch < width * 4) 1103 | return false; 1104 | 1105 | if (!out_data || !in_data) 1106 | return false; 1107 | 1108 | if (m_inv_alpha_f == 0.f) 1109 | return false; 1110 | 1111 | int thread_count_adjusted = m_thread_count - 1; 1112 | 1113 | ////////////////////////////////////////////// 1114 | // horizontal filter divided in threads 1115 | for (int i = 0; i < thread_count_adjusted; i++) 1116 | { 1117 | m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::horizontalFilter, this, i, in_data, m_stage_buffer[0], width, height, pitch); 1118 | } 1119 | 1120 | // use this thread for last segment 1121 | horizontalFilter(thread_count_adjusted, in_data, m_stage_buffer[0], width, height, pitch); 1122 | 1123 | // wait for result 1124 | for (int i = 0; i < thread_count_adjusted; i++) 1125 | { 1126 | m_horizontal_tasks[i].get(); 1127 | } 1128 | 1129 | ///////////////////////////////////////////// 1130 | // vertical filter divided in threads 1131 | for (int i = 0; i < thread_count_adjusted; i++) 1132 | { 1133 | m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::verticalFilter, this, i, m_stage_buffer[0], out_data, width, height, pitch); 1134 | } 1135 | 1136 | // use this thread for last segment 1137 | verticalFilter(thread_count_adjusted, m_stage_buffer[0], out_data, width, height, pitch); 1138 | 1139 | // wait for result 1140 | for (int i = 0; i < thread_count_adjusted; i++) 1141 | { 1142 | m_vertical_tasks[i].get(); 1143 | } 1144 | 1145 | return true; 1146 | } 1147 | 1148 | bool CRBFilterAVX2::filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) 1149 | { 1150 | // basic error checking 1151 | if (!m_stage_buffer[0]) 1152 | return false; 1153 | 1154 | if 
(width < 16 || width > m_reserved_width) 1155 | return false; 1156 | 1157 | if (height < 16 || height > m_reserved_height) 1158 | return false; 1159 | 1160 | if (pitch < width * 4) 1161 | return false; 1162 | 1163 | if (m_inv_alpha_f == 0.f) 1164 | return false; 1165 | 1166 | m_image_width = width; 1167 | m_image_height = height; 1168 | m_image_pitch = pitch; 1169 | 1170 | // block until last frame finished 1st stage 1171 | for (int i = 0; i < m_thread_count; i++) 1172 | { 1173 | if(m_horizontal_tasks[i].valid()) 1174 | m_horizontal_tasks[i].get(); 1175 | } 1176 | 1177 | int previous_stage_index = (m_filter_counter - 1) % STAGE_BUFFER_COUNT; 1178 | int current_stage_index = m_filter_counter % STAGE_BUFFER_COUNT; 1179 | m_filter_counter++; 1180 | m_out_buffer[current_stage_index] = out_data; 1181 | 1182 | // start new horizontal stage 1183 | if (in_data) 1184 | { 1185 | // start first stage for current frame 1186 | for (int i = 0; i < m_thread_count; i++) 1187 | { 1188 | m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::horizontalFilter, this, i, in_data, m_stage_buffer[current_stage_index], width, height, pitch); 1189 | } 1190 | } 1191 | 1192 | // block until last frame finished 2nd stage 1193 | for (int i = 0; i < m_thread_count; i++) 1194 | { 1195 | if (m_vertical_tasks[i].valid()) 1196 | m_vertical_tasks[i].get(); 1197 | } 1198 | 1199 | // start new vertical stage based on result of previous stage 1200 | if (previous_stage_index >= 0 && m_out_buffer[previous_stage_index]) 1201 | { 1202 | // start first stage for current frame 1203 | for (int i = 0; i < m_thread_count; i++) 1204 | { 1205 | m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::verticalFilter, this, i, m_stage_buffer[previous_stage_index], m_out_buffer[previous_stage_index], width, height, pitch); 1206 | } 1207 | } 1208 | 1209 | return true; 1210 | } 1211 | 1212 | void CRBFilterAVX2::filterPipeFlush() 1213 | { 1214 | filterPipePush(nullptr, nullptr, 
m_image_width, m_image_height, m_image_pitch); 1215 | 1216 | if (m_filter_counter > 0) 1217 | { 1218 | for (int i = 0; i < m_thread_count; i++) 1219 | { 1220 | if(m_vertical_tasks[i].valid()) 1221 | m_vertical_tasks[i].get(); 1222 | } 1223 | } 1224 | 1225 | m_filter_counter = 0; 1226 | } --------------------------------------------------------------------------------