├── RBF_run64.png
├── RBF_chart32.png
├── RBF_chart64.png
├── RBF_chart64vs32.png
├── images
    ├── testGirl.jpg
    ├── Thefarmhouse.jpg
    └── testpatern5.png
├── stdafx.cpp
├── targetver.h
├── stdafx.h
├── LICENSE
├── RecursiveBilateralFilter.sln
├── RBFilterPlain.h
├── RBFilter_SSE2.h
├── RBFilter_AVX2.h
├── ReadMe.md
├── RecursiveBilateralFilter.vcxproj
├── RBFilterPlain.cpp
├── rbf.hpp
├── RecursiveBilateralFilter.cpp
├── RBFilter_SSE2.cpp
├── stb_image_write.h
└── RBFilter_AVX2.cpp


/RBF_run64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_run64.png


--------------------------------------------------------------------------------
/RBF_chart32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart32.png


--------------------------------------------------------------------------------
/RBF_chart64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart64.png


--------------------------------------------------------------------------------
/RBF_chart64vs32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/RBF_chart64vs32.png


--------------------------------------------------------------------------------
/images/testGirl.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/testGirl.jpg


--------------------------------------------------------------------------------
/images/Thefarmhouse.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/Thefarmhouse.jpg


--------------------------------------------------------------------------------
/images/testpatern5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Fig1024/OP_RBF/HEAD/images/testpatern5.png


--------------------------------------------------------------------------------
/stdafx.cpp:
--------------------------------------------------------------------------------
1 | // stdafx.cpp : source file that includes just the standard includes
2 | // RecursiveBilateralFilter.pch will be the pre-compiled header
3 | // stdafx.obj will contain the pre-compiled type information
4 | 
5 | #include "stdafx.h"
6 | 
7 | // TODO: reference any additional headers you need in STDAFX.H
8 | // and not in this file
9 | 


--------------------------------------------------------------------------------
/targetver.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | // Including SDKDDKVer.h defines the highest available Windows platform.
4 | 
5 | // If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
6 | // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
7 | 
8 | #include <SDKDDKVer.h>
9 | 


--------------------------------------------------------------------------------
/stdafx.h:
--------------------------------------------------------------------------------
 1 | // stdafx.h : include file for standard system include files,
 2 | // or project specific include files that are used frequently, but
 3 | // are changed infrequently
 4 | //
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "targetver.h"
 9 | 
10 | // these are needed for the image loader STB
11 | #define STB_IMAGE_IMPLEMENTATION
12 | #define STB_IMAGE_WRITE_IMPLEMENTATION
13 | #define _CRT_SECURE_NO_WARNINGS
14 | 
15 | #include <stdio.h>
16 | #include <tchar.h>
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Ming
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/RecursiveBilateralFilter.sln:
--------------------------------------------------------------------------------
 1 | 
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio 14
 4 | VisualStudioVersion = 14.0.25420.1
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "RecursiveBilateralFilter", "RecursiveBilateralFilter.vcxproj", "{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|x64 = Debug|x64
11 | 		Debug|x86 = Debug|x86
12 | 		Release|x64 = Release|x64
13 | 		Release|x86 = Release|x86
14 | 	EndGlobalSection
15 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x64.ActiveCfg = Debug|x64
17 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x64.Build.0 = Debug|x64
18 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x86.ActiveCfg = Debug|Win32
19 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Debug|x86.Build.0 = Debug|Win32
20 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x64.ActiveCfg = Release|x64
21 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x64.Build.0 = Release|x64
22 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x86.ActiveCfg = Release|Win32
23 | 		{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}.Release|x86.Build.0 = Release|Win32
24 | 	EndGlobalSection
25 | 	GlobalSection(SolutionProperties) = preSolution
26 | 		HideSolutionNode = FALSE
27 | 	EndGlobalSection
28 | EndGlobal
29 | 


--------------------------------------------------------------------------------
/RBFilterPlain.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | 
 4 | // This class is useful only for understanding the main principles of the Recursive Bilateral Filter.
 5 | // It is designed in a non-optimal but easy-to-understand way. It also does not match the original 1:1; 
 6 | // some creative liberties were taken with the original idea.
 7 | // This class is not used in performance tests.
 8 | 
 9 | class CRBFilterPlain
10 | {
11 | 	int			m_reserve_width = 0;
12 | 	int			m_reserve_height = 0;
13 | 	int			m_reserve_channels = 0;
14 | 
15 | 	float*		m_left_pass_color = nullptr;
16 | 	float*		m_left_pass_factor = nullptr;
17 | 
18 | 	float*		m_right_pass_color = nullptr;
19 | 	float*		m_right_pass_factor = nullptr;
20 | 
21 | 	float*		m_down_pass_color = nullptr;
22 | 	float*		m_down_pass_factor = nullptr;
23 | 
24 | 	float*		m_up_pass_color = nullptr;
25 | 	float*		m_up_pass_factor = nullptr;
26 | 
27 | 	int getDiffFactor(const unsigned char* color1, const unsigned char* color2) const; // NOTE(review): presumably a color-difference measure between two pixels - confirm against the definition
28 | 
29 | public:
30 | 
31 | 	CRBFilterPlain();
32 | 	~CRBFilterPlain();
33 | 
34 | 	// assumes 3- or 4-channel images, 1 byte per channel
35 | 	void reserveMemory(int max_width, int max_height, int channels);
36 | 	void releaseMemory();
37 | 
38 | 	// memory must be reserved (see reserveMemory) before calling the image filter
39 | 	// this implementation of the filter uses plain C++ and is single threaded
40 | 	// channel count ('channel') must be 3 or 4 (alpha is not used)
41 | 	void filter(unsigned char* img_src, unsigned char* img_dst,
42 | 		float sigma_spatial, float sigma_range,
43 | 		int width, int height, int channel);
44 | };


--------------------------------------------------------------------------------
/RBFilter_SSE2.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // Optimized SSE2 implementation of Recursive Bilateral Filter
 4 | // 
 5 | 
 6 | #include <future>
 7 | 
 8 | #define RBF_MAX_THREADS 8
 9 | #define STAGE_BUFFER_COUNT 3
10 | 
11 | class CRBFilterSSE2
12 | {
13 | 	int				m_reserved_width = 0;
14 | 	int				m_reserved_height = 0;
15 | 	int				m_thread_count = 0;
16 | 	bool			m_pipelined = false;
17 | 
18 | 	float			m_sigma_spatial = 0.f;
19 | 	float			m_sigma_range = 0.f;
20 | 	float			m_inv_alpha_f = 0.f;
21 | 	float*			m_range_table = nullptr;
22 | 
23 | 	int				m_filter_counter = 0; // used in pipelined mode
24 | 	unsigned char*	m_stage_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // size width * height * 4, 2nd one null if not pipelined
25 | 	float**			m_h_line_cache = nullptr; // single line cache for horizontal filter pass, one per thread
26 | 	float**			m_v_line_cache = nullptr; // if not in pipelined mode, this is equal to 'm_h_line_cache'
27 | 	unsigned char*	m_out_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // used for keeping track of current output buffer in pipelined mode 
28 | 	int				m_image_width = 0; // cache of sizes for pipelined mode
29 | 	int				m_image_height = 0;
30 | 	int				m_image_pitch = 0;
31 | 
32 | 	std::future<void> m_horizontal_tasks[RBF_MAX_THREADS];
33 | 	std::future<void> m_vertical_tasks[RBF_MAX_THREADS];
34 | 
35 | 	// core filter functions
36 | 	void horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch);
37 | 	void verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch);
38 | 
39 | public:
40 | 
41 | 	CRBFilterSSE2();
42 | 	~CRBFilterSSE2();
43 | 
44 | 	// 'sigma_spatial' - unlike the original implementation of Recursive Bilateral Filter, 
45 | 	// the value of sigma_spatial is not influenced by image width/height.
46 | 	// In this implementation, sigma_spatial is assumed over image width 255, height 255
47 | 	void setSigma(float sigma_spatial, float sigma_range);
48 | 
49 | 	// Source and destination images are assumed to be 4-component (4 bytes per pixel)
50 | 	// 'width' - maximum image width
51 | 	// 'height' - maximum image height
52 | 	// 'thread_count' - total thread count to use for each filter stage (horizontal and vertical), recommended thread count = 4
53 | 	// 'pipelined' - if true, then horizontal and vertical filter passes are split into separate stages,
54 | 	// where each stage uses 'thread_count' threads (so the total is basically doubled)
55 | 	// Returns true if successful; has only very basic error checking
56 | 	bool initialize(int width, int height, int thread_count = 1, bool pipelined = false);
57 | 	
58 | 	// de-initialize, free memory
59 | 	void release();
60 | 
61 | 	// synchronous filter function, returns only when everything has finished, runs faster with multiple threads
62 | 	// initialize() and setSigma() should be called before this
63 | 	// 'out_data' - output image buffer, assumes 4 bytes per pixel
64 | 	// 'in_data' - input image buffer, assumes 4 bytes per pixel
65 | 	// 'width' - width of both input and output buffers, must be the same for both
66 | 	// 'height' - height of both input and output buffers, must be the same for both
67 | 	// 'pitch' - row size in bytes, must be the same for both buffers (ideally, this should be divisible by 16)
68 | 	// returns false if filtering failed for some reason
69 | 	bool filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch);
70 | 
71 | 	// asynchronous, pipelined filter function
72 | 	// the pipeline consists of 2 stages, one for the horizontal filter, the other for the vertical filter
73 | 	// this is useful for video filtering where a 1-2 frame delay is acceptable
74 | 	// for simplicity of this sample implementation, input and output data buffers must remain valid until filtering is finished
75 | 	// since it's a 2 stage pipeline, consecutive calls should submit alternating buffers (2 sets of input and output buffers)
76 | 	// This function blocks until the 1st stage of the previous call has finished
77 | 	bool filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch);
78 | 	// this function blocks until both stages have finished all processing
79 | 	// it should always be used to get the last frame
80 | 	void filterPipeFlush();
81 | };


--------------------------------------------------------------------------------
/RBFilter_AVX2.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // Optimized AVX2 implementation of Recursive Bilateral Filter
 4 | // 
 5 | 
 6 | #include <future>
 7 | 
 8 | #define RBF_MAX_THREADS 8
 9 | #define STAGE_BUFFER_COUNT 3
10 | 
11 | class CRBFilterAVX2
12 | {
13 | 	int				m_reserved_width = 0;
14 | 	int				m_reserved_height = 0;
15 | 	int				m_thread_count = 0;
16 | 	bool			m_pipelined = false;
17 | 
18 | 	float			m_sigma_spatial = 0.f;
19 | 	float			m_sigma_range = 0.f;
20 | 	float			m_inv_alpha_f = 0.f;
21 | 	float*			m_range_table = nullptr;
22 | 
23 | 	int				m_filter_counter = 0; // used in pipelined mode
24 | 	unsigned char*	m_stage_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // size width * height * 4, others are null if not pipelined
25 | 	float**			m_h_line_cache = nullptr; // line cache for horizontal filter pass, 1 per thread
26 | 	float**			m_v_line_cache = nullptr; // line cache for vertical filter pass, 1 per thread
27 | 	unsigned char*	m_out_buffer[STAGE_BUFFER_COUNT] = { nullptr }; // used for keeping track of current output buffer in pipelined mode 
28 | 	int				m_image_width = 0; // cache of sizes for pipelined mode
29 | 	int				m_image_height = 0;
30 | 	int				m_image_pitch = 0;
31 | 
32 | 	std::future<void> m_horizontal_tasks[RBF_MAX_THREADS];
33 | 	std::future<void> m_vertical_tasks[RBF_MAX_THREADS];
34 | 
35 | 	// core filter functions
36 | 	void horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch);
37 | 	void verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch);
38 | 
39 | public:
40 | 
41 | 	CRBFilterAVX2();
42 | 	~CRBFilterAVX2();
43 | 
44 | 	// given the specified image width, returns the optimal row size in bytes, rounded up to better fit YMM registers
45 | 	// image buffers should use this pitch for input and output
46 | 	int getOptimalPitch(int width) const; 
47 | 
48 | 	// 'sigma_spatial' - unlike the original implementation of Recursive Bilateral Filter, 
49 | 	// the value of sigma_spatial is not influenced by image width/height.
50 | 	// In this implementation, sigma_spatial is assumed over image width 255, height 255
51 | 	void setSigma(float sigma_spatial, float sigma_range);
52 | 
53 | 	// Source and destination images are assumed to be 4-component (4 bytes per pixel)
54 | 	// 'width' - maximum image width
55 | 	// 'height' - maximum image height
56 | 	// 'thread_count' - total thread count to use for each filter stage (horizontal and vertical), recommended thread count = 4
57 | 	// 'pipelined' - if true, then horizontal and vertical filter passes are split into separate stages,
58 | 	// where each stage uses 'thread_count' threads (so the total is basically doubled)
59 | 	// Returns true if successful; has only very basic error checking
60 | 	bool initialize(int width, int height, int thread_count = 1, bool pipelined = false);
61 | 	
62 | 	// de-initialize, free memory
63 | 	void release();
64 | 
65 | 	// synchronous filter function, returns only when everything has finished, runs faster with multiple threads
66 | 	// initialize() and setSigma() should be called before this
67 | 	// 'out_data' - output image buffer, assumes 4 bytes per pixel
68 | 	// 'in_data' - input image buffer, assumes 4 bytes per pixel
69 | 	// 'width' - width of both input and output buffers, must be the same for both
70 | 	// 'height' - height of both input and output buffers, must be the same for both
71 | 	// 'pitch' - row size in bytes, must be the same for both buffers (ideally, this should be divisible by 16) - NOTE(review): "16" looks copied from the SSE2 header; see getOptimalPitch() for the pitch this class recommends - confirm
72 | 	// returns false if filtering failed for some reason
73 | 	bool filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch);
74 | 
75 | 	// asynchronous, pipelined filter function
76 | 	// the pipeline consists of 2 stages, one for the horizontal filter, the other for the vertical filter
77 | 	// this is useful for video filtering where a 1-2 frame delay is acceptable
78 | 	// for simplicity of this sample implementation, input and output data buffers must remain valid until filtering is finished
79 | 	// since it's a 2 stage pipeline, consecutive calls should submit alternating buffers (2 sets of input and output buffers)
80 | 	// This function blocks until the 1st stage of the previous call has finished
81 | 	bool filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch);
82 | 	// this function blocks until both stages have finished all processing
83 | 	// it should always be used to get the last frame
84 | 	void filterPipeFlush();
85 | };


--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
 1 | # Optimized Recursive Bilateral Filter
 2 | 
 3 | This project is a derivative work based on this project:
 4 | https://github.com/ufoym/RecursiveBF
 5 | 
 6 | The main purpose of this project is to provide a more optimized implementation of the edge preserving Recursive Bilateral Filter. For more information about the image filter, see the link above
 7 | 
 8 | This project was made with VS2015 on Windows platform, but it should be easy to port if necessary. There aren't many files and I don't think any Windows specific functions were used.
 9 | 
10 | Optimization is based on 3 categories: reducing memory usage, adding multithreading, adding SSE2 / AVX2 C++ intrinsics
11 | 
12 | * Memory usage: in the original implementation, memory usage for an RGB32 or RGBA image is roughly width * height * 40 + width * 40. In the optimized implementation, it is roughly width * height * 4 + width * 80 for the non-pipelined version, and width * height * 12 + width * 80 for the pipelined version. In general, almost 10x less memory allocation
13 | 
14 | * Multithreading: the original implementation is written as a single-threaded solution, in a way that is not easy to split into threads. The optimized solution is multithread friendly because it separates the filter into 2 stages, one for the horizontal filter pass, the other for the vertical filter pass. Each filter pass can then be subdivided into a user-chosen number of threads. For the horizontal filter, each thread handles its own row from the original data buffer, while for the vertical pass, each thread handles its own column block
15 | 
16 | * SSE2 and AVX2: original implementation is written in basic C++ and while it is possible to select SSE2 or AVX2 optimization guidelines in compiler, the generated code does not properly take advantage of that functionality. Optimized solution provides 2 separate implementations, one written almost exclusively with SSE2 intrinsics, another almost exclusively with AVX2 intrinsics, so the compiler can utilize their capabilities much more effectively.
17 | 
18 | It's important to mention that this optimized implementation has some fundamental differences with the original. Those are:
19 | 
20 | * Only images with 4 bytes per pixel are accepted, this means RGB32 or RGBA. With some light modifications, it would be possible to adapt it to work with RGB24, single channel, or YUV 422 (2 pixels in 4 bytes)
21 | 
22 | * Edge detection algorithm is different. In the original version, the 3 components (RGB) of 2 adjacent pixels are evaluated for absolute difference, then 2 of those absolute differences are divided by 4 and added together, and the 3rd component is divided by 2 and added to the sum. The goal is to get the absolute difference between 2 pixels in the 0-255 range, but this solution gives one of the components 2x the significance of the other 2. The optimized solution offers 2 alternative options (chosen with a compiler flag): either get the maximum of the absolute differences between the 3 components (stronger blur) or get the 255-saturated sum of the absolute differences of the 3 components (weaker blur). Both methods have equal cost. The value of the 4th component (alpha) is not taken into account, but it would be easy to do so if needed
23 | 
24 | * Sigma Spatial: in the original implementation, sigma spatial, one of the 2 blur parameters, has a dependency on image width and height. That means the same value would yield a different amount of blur based on the size of the image. The optimized solution removes that dependency by anchoring sigma spatial to an arbitrary value of 255, making it uniform for both width and height.
25 | 
26 | For testing purposes, 3 images were chosen: 
27 | 
28 | * testGirl.jpg - smallest image, 448 x 626, it is the same image used in original implementation 
29 | * Thefarmhouse.jpg - larger image, 1440 x 1080, it is a painting with lots of small noise that can be blurred. 
30 | * testpatern5.png - full HD image, 1920 x 1080, it is a test pattern that has no noise and sharp edges. It is useful for purpose of verifying that edge preserving image filter has minimum impact on the edges
31 | 
32 | Here's what built 64 bit application output looks like on Intel i7-4700HQ (~2.4 GHz):
33 | 
34 | ![alt text](./RBF_run64.png "64 bit application")
35 | 
36 | Image paths and blur strength (sigma) values are hardcoded, at top of RecursiveBilateralFilter.cpp
37 | When application runs, it saves filtered image under generated name in same folder as original images
38 | 
39 | Here is the same data in chart form, so it's easier to understand (time is in ms):
40 | 
41 | ![alt text](./RBF_chart64.png "64 bit chart")
42 | 
43 | It's interesting to note that the same application compiled as 32 bit performs significantly slower, especially for the original function
44 | 
45 | ![alt text](./RBF_chart32.png "32 bit chart")
46 | 
47 | Here's direct comparison of 64 bit vs 32 bit for full HD image
48 | 
49 | ![alt text](./RBF_chart64vs32.png "64 vs 32 bit chart")
50 | 
51 | Optimized solution provides 2 filter functions, one is designed for synchronous use - when multithreading is enabled, the function splits its work among threads and waits until they finish. Other filter function is asynchronous "push pipeline" mode, it divides task in 2 stages, horizontal filter pass and vertical filter pass. When horizontal pass is finished, it can start on next image while vertical pass starts on results of horizontal pass. 
52 | Further optimizations with multithreading are possible, current implementation is provided as simple example.
53 | 
54 | Most of the focus of this project is on utilization of XMM and YMM registers with SSE2 and AVX2 intrinsic functions. From the charts above, it is clear that even single threaded solution offers considerable speed up over original. It's also interesting to note that additional multithreading has diminishing returns, especially for small images.
55 | 
56 | The SSE2-based filter solution was implemented to work with unaligned image buffers, while AVX2 requires input and output buffers to follow 32-byte alignment. It is possible to remove or relax that requirement with minor modifications; there is not a significant penalty for working with unaligned memory for read operations, but write operations would need a few extra instructions and generally make for messier code
57 | 
58 | This project also provides a simple unoptimized C++ implementation of the Recursive Bilateral Filter in files RBFilterPlain.h, RBFilterPlain.cpp. This implementation does not participate in tests and it is only useful for the purposes of helping to understand the core of the algorithm. It's also useful for tinkering with filter design
59 | 
60 | In conclusion, the most optimized implementation of Recursive Bilateral Filter is able to achieve roughly 10x speed up over the original (slightly less)
61 | 
62 | It is even possible to process full HD video at 60 fps, with some room to spare on the CPU (though not much). For video processing, it would be best to add YUV 420 support, which is somewhat more involved due to its planar format.
63 | 
64 | 


--------------------------------------------------------------------------------
/RecursiveBilateralFilter.vcxproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup Label="ProjectConfigurations">
  4 |     <ProjectConfiguration Include="Debug|Win32">
  5 |       <Configuration>Debug</Configuration>
  6 |       <Platform>Win32</Platform>
  7 |     </ProjectConfiguration>
  8 |     <ProjectConfiguration Include="Release|Win32">
  9 |       <Configuration>Release</Configuration>
 10 |       <Platform>Win32</Platform>
 11 |     </ProjectConfiguration>
 12 |     <ProjectConfiguration Include="Debug|x64">
 13 |       <Configuration>Debug</Configuration>
 14 |       <Platform>x64</Platform>
 15 |     </ProjectConfiguration>
 16 |     <ProjectConfiguration Include="Release|x64">
 17 |       <Configuration>Release</Configuration>
 18 |       <Platform>x64</Platform>
 19 |     </ProjectConfiguration>
 20 |   </ItemGroup>
 21 |   <PropertyGroup Label="Globals">
 22 |     <ProjectGuid>{B003F67D-4A02-47C4-B0EC-E1A7BDC62663}</ProjectGuid>
 23 |     <Keyword>Win32Proj</Keyword>
 24 |     <RootNamespace>RecursiveBilateralFilter</RootNamespace>
 25 |     <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
 26 |   </PropertyGroup>
 27 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 28 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
 29 |     <ConfigurationType>Application</ConfigurationType>
 30 |     <UseDebugLibraries>true</UseDebugLibraries>
 31 |     <PlatformToolset>v140</PlatformToolset>
 32 |     <CharacterSet>Unicode</CharacterSet>
 33 |   </PropertyGroup>
 34 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
 35 |     <ConfigurationType>Application</ConfigurationType>
 36 |     <UseDebugLibraries>false</UseDebugLibraries>
 37 |     <PlatformToolset>v140</PlatformToolset>
 38 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 39 |     <CharacterSet>Unicode</CharacterSet>
 40 |   </PropertyGroup>
 41 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
 42 |     <ConfigurationType>Application</ConfigurationType>
 43 |     <UseDebugLibraries>true</UseDebugLibraries>
 44 |     <PlatformToolset>v140</PlatformToolset>
 45 |     <CharacterSet>Unicode</CharacterSet>
 46 |   </PropertyGroup>
 47 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
 48 |     <ConfigurationType>Application</ConfigurationType>
 49 |     <UseDebugLibraries>false</UseDebugLibraries>
 50 |     <PlatformToolset>v140</PlatformToolset>
 51 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 52 |     <CharacterSet>Unicode</CharacterSet>
 53 |   </PropertyGroup>
 54 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
 55 |   <ImportGroup Label="ExtensionSettings">
 56 |   </ImportGroup>
 57 |   <ImportGroup Label="Shared">
 58 |   </ImportGroup>
 59 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 60 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 61 |   </ImportGroup>
 62 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 63 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 64 |   </ImportGroup>
 65 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 66 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 67 |   </ImportGroup>
 68 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 69 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 70 |   </ImportGroup>
 71 |   <PropertyGroup Label="UserMacros" />
 72 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 73 |     <LinkIncremental>true</LinkIncremental>
 74 |   </PropertyGroup>
 75 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 76 |     <LinkIncremental>true</LinkIncremental>
 77 |   </PropertyGroup>
 78 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 79 |     <LinkIncremental>false</LinkIncremental>
 80 |   </PropertyGroup>
 81 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 82 |     <LinkIncremental>false</LinkIncremental>
 83 |   </PropertyGroup>
 84 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 85 |     <ClCompile>
 86 |       <PrecompiledHeader>Use</PrecompiledHeader>
 87 |       <WarningLevel>Level3</WarningLevel>
 88 |       <Optimization>Disabled</Optimization>
 89 |       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 90 |       <SDLCheck>true</SDLCheck>
 91 |       <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
 92 |     </ClCompile>
 93 |     <Link>
 94 |       <SubSystem>Console</SubSystem>
 95 |       <GenerateDebugInformation>true</GenerateDebugInformation>
 96 |     </Link>
 97 |   </ItemDefinitionGroup>
 98 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 99 |     <ClCompile>
100 |       <PrecompiledHeader>Use</PrecompiledHeader>
101 |       <WarningLevel>Level3</WarningLevel>
102 |       <Optimization>Disabled</Optimization>
103 |       <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
104 |       <SDLCheck>true</SDLCheck>
105 |       <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
106 |     </ClCompile>
107 |     <Link>
108 |       <SubSystem>Console</SubSystem>
109 |       <GenerateDebugInformation>true</GenerateDebugInformation>
110 |     </Link>
111 |   </ItemDefinitionGroup>
112 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
113 |     <ClCompile>
114 |       <WarningLevel>Level3</WarningLevel>
115 |       <PrecompiledHeader>Use</PrecompiledHeader>
116 |       <Optimization>MaxSpeed</Optimization>
117 |       <FunctionLevelLinking>true</FunctionLevelLinking>
118 |       <IntrinsicFunctions>true</IntrinsicFunctions>
119 |       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
120 |       <SDLCheck>true</SDLCheck>
121 |       <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
122 |       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
123 |     </ClCompile>
124 |     <Link>
125 |       <SubSystem>Console</SubSystem>
126 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
127 |       <OptimizeReferences>true</OptimizeReferences>
128 |       <GenerateDebugInformation>true</GenerateDebugInformation>
129 |     </Link>
130 |   </ItemDefinitionGroup>
131 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
132 |     <ClCompile>
133 |       <WarningLevel>Level3</WarningLevel>
134 |       <PrecompiledHeader>Use</PrecompiledHeader>
135 |       <Optimization>MaxSpeed</Optimization>
136 |       <FunctionLevelLinking>true</FunctionLevelLinking>
137 |       <IntrinsicFunctions>true</IntrinsicFunctions>
138 |       <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
139 |       <SDLCheck>true</SDLCheck>
140 |       <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
141 |       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
142 |     </ClCompile>
143 |     <Link>
144 |       <SubSystem>Console</SubSystem>
145 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
146 |       <OptimizeReferences>true</OptimizeReferences>
147 |       <GenerateDebugInformation>true</GenerateDebugInformation>
148 |     </Link>
149 |   </ItemDefinitionGroup>
150 |   <ItemGroup>
151 |     <Text Include="ReadMe.txt" />
152 |   </ItemGroup>
153 |   <ItemGroup>
154 |     <ClInclude Include="rbf.hpp" />
155 |     <ClInclude Include="RBFilterPlain.h" />
156 |     <ClInclude Include="RBFilter_AVX2.h" />
157 |     <ClInclude Include="RBFilter_SSE2.h" />
158 |     <ClInclude Include="stb_image.h" />
159 |     <ClInclude Include="stb_image_write.h" />
160 |     <ClInclude Include="stdafx.h" />
161 |     <ClInclude Include="targetver.h" />
162 |   </ItemGroup>
163 |   <ItemGroup>
164 |     <ClCompile Include="RBFilterPlain.cpp" />
165 |     <ClCompile Include="RBFilter_AVX2.cpp" />
166 |     <ClCompile Include="RBFilter_SSE2.cpp" />
167 |     <ClCompile Include="RecursiveBilateralFilter.cpp" />
168 |     <ClCompile Include="stdafx.cpp">
169 |       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
170 |       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
171 |       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
172 |       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
173 |     </ClCompile>
174 |   </ItemGroup>
175 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
176 |   <ImportGroup Label="ExtensionTargets">
177 |   </ImportGroup>
178 | </Project>


--------------------------------------------------------------------------------
/RBFilterPlain.cpp:
--------------------------------------------------------------------------------
  1 | #include "stdafx.h"
  2 | #include "RBFilterPlain.h"
  3 | #include "stdafx.h"
  4 | #include "RBFilterPlain.h"
  5 | #include <algorithm>
  6 | 
  7 | using namespace std;
  8 | 
  9 | #define QX_DEF_CHAR_MAX 255
 10 | 
 11 | 
 12 | CRBFilterPlain::CRBFilterPlain()
 13 | {
 14 | 
 15 | }
 16 | 
 17 | CRBFilterPlain::~CRBFilterPlain()
 18 | {
 19 | 	releaseMemory();
 20 | }
 21 | 
 22 | // assumes 3/4 channel images, 1 byte per channel
 23 | void CRBFilterPlain::reserveMemory(int max_width, int max_height, int channels)
 24 | {
 25 | 	// basic sanity check
 26 | 	_ASSERT(max_width >= 10 && max_width < 10000);
 27 | 	_ASSERT(max_height >= 10 && max_height < 10000);
 28 | 	_ASSERT(channels >= 1 && channels <= 4);
 29 | 
 30 | 	releaseMemory();
 31 | 
 32 | 	m_reserve_width = max_width;
 33 | 	m_reserve_height = max_height;
 34 | 	m_reserve_channels = channels;
 35 | 
 36 | 	int width_height = m_reserve_width * m_reserve_height;
 37 | 	int width_height_channel = width_height * m_reserve_channels;
 38 | 
 39 | 	m_left_pass_color = new float[width_height_channel];
 40 | 	m_left_pass_factor = new float[width_height];
 41 | 
 42 | 	m_right_pass_color = new float[width_height_channel];
 43 | 	m_right_pass_factor = new float[width_height];
 44 | 
 45 | 	m_down_pass_color = new float[width_height_channel];
 46 | 	m_down_pass_factor = new float[width_height];
 47 | 
 48 | 	m_up_pass_color = new float[width_height_channel];
 49 | 	m_up_pass_factor = new float[width_height];
 50 | }
 51 | 
 52 | void CRBFilterPlain::releaseMemory()
 53 | {
 54 | 	m_reserve_width = 0;
 55 | 	m_reserve_height = 0;
 56 | 	m_reserve_channels = 0;
 57 | 
 58 | 	if (m_left_pass_color)
 59 | 	{
 60 | 		delete[] m_left_pass_color;
 61 | 		m_left_pass_color = nullptr;
 62 | 	}
 63 | 
 64 | 	if (m_left_pass_factor)
 65 | 	{
 66 | 		delete[] m_left_pass_factor;
 67 | 		m_left_pass_factor = nullptr;
 68 | 	}
 69 | 
 70 | 	if (m_right_pass_color)
 71 | 	{
 72 | 		delete[] m_right_pass_color;
 73 | 		m_right_pass_color = nullptr;
 74 | 	}
 75 | 
 76 | 	if (m_right_pass_factor)
 77 | 	{
 78 | 		delete[] m_right_pass_factor;
 79 | 		m_right_pass_factor = nullptr;
 80 | 	}
 81 | 
 82 | 	if (m_down_pass_color)
 83 | 	{
 84 | 		delete[] m_down_pass_color;
 85 | 		m_down_pass_color = nullptr;
 86 | 	}
 87 | 
 88 | 	if (m_down_pass_factor)
 89 | 	{
 90 | 		delete[] m_down_pass_factor;
 91 | 		m_down_pass_factor = nullptr;
 92 | 	}
 93 | 
 94 | 	if (m_up_pass_color)
 95 | 	{
 96 | 		delete[] m_up_pass_color;
 97 | 		m_up_pass_color = nullptr;
 98 | 	}
 99 | 
100 | 	if (m_up_pass_factor)
101 | 	{
102 | 		delete[] m_up_pass_factor;
103 | 		m_up_pass_factor = nullptr;
104 | 	}
105 | }
106 | 
107 | int CRBFilterPlain::getDiffFactor(const unsigned char* color1, const unsigned char* color2) const
108 | {
109 | 	int final_diff;
110 | 	int component_diff[4];
111 | 
112 | 	// find absolute difference between each component
113 | 	for (int i = 0; i < m_reserve_channels; i++)
114 | 	{
115 | 		component_diff[i] = abs(color1[i] - color2[i]);
116 | 	}
117 | 
118 | 	// based on number of components, produce a single difference value in the 0-255 range
119 | 	switch (m_reserve_channels)
120 | 	{
121 | 	case 1:
122 | 		final_diff = component_diff[0];
123 | 		break;
124 | 
125 | 	case 2:
126 | 		final_diff = ((component_diff[0] + component_diff[1]) >> 1);
127 | 		break;
128 | 
129 | 	case 3:
130 | 		final_diff = ((component_diff[0] + component_diff[2]) >> 2) + (component_diff[1] >> 1);
131 | 		break;
132 | 
133 | 	case 4:
134 | 		final_diff = ((component_diff[0] + component_diff[1] + component_diff[2] + component_diff[3]) >> 2);
135 | 		break;
136 | 
137 | 	default:
138 | 		final_diff = 0;
139 | 	}
140 | 
141 | 	_ASSERT(final_diff >= 0 && final_diff <= 255);
142 | 
143 | 	return final_diff;
144 | }
145 | 
146 | // memory must be reserved before calling image filter
147 | // this implementation of filter uses plain C++, single threaded
148 | // channel count must be 3 or 4 (alpha not used)
149 | void CRBFilterPlain::filter(unsigned char* img_src, unsigned char* img_dst,
150 | 	float sigma_spatial, float sigma_range,
151 | 	int width, int height, int channel)
152 | {
153 | 	_ASSERT(img_src);
154 | 	_ASSERT(img_dst);
155 | 	_ASSERT(m_reserve_channels == channel);
156 | 	_ASSERT(m_reserve_width >= width);
157 | 	_ASSERT(m_reserve_height >= height);
158 | 
159 | 	// compute a lookup table
160 | 	float alpha_f = static_cast<float>(exp(-sqrt(2.0) / (sigma_spatial * 255)));
161 | 	float inv_alpha_f = 1.f - alpha_f;
162 | 
163 | 
164 | 	float range_table_f[QX_DEF_CHAR_MAX + 1];
165 | 	float inv_sigma_range = 1.0f / (sigma_range * QX_DEF_CHAR_MAX);
166 | 	{
167 | 		float ii = 0.f;
168 | 		for (int i = 0; i <= QX_DEF_CHAR_MAX; i++, ii -= 1.f)
169 | 		{
170 | 			range_table_f[i] = alpha_f * exp(ii * inv_sigma_range);
171 | 		}
172 | 	}
173 | 
174 | 	///////////////
175 | 	// Left pass
176 | 	{
177 | 		const unsigned char* src_color = img_src;
178 | 		float* left_pass_color = m_left_pass_color;
179 | 		float* left_pass_factor = m_left_pass_factor;
180 | 
181 | 		for (int y = 0; y < height; y++)
182 | 		{
183 | 			const unsigned char* src_prev = src_color;
184 | 			const float* prev_factor = left_pass_factor;
185 | 			const float* prev_color = left_pass_color;
186 | 
187 | 			// process 1st pixel separately since it has no previous
188 | 			*left_pass_factor++ = 1.f;
189 | 			for (int c = 0; c < channel; c++)
190 | 			{
191 | 				*left_pass_color++ = *src_color++;
192 | 			}
193 | 
194 | 			// handle other pixels
195 | 			for (int x = 1; x < width; x++)
196 | 			{
197 | 				// determine difference in pixel color between current and previous
198 | 				// calculation is different depending on number of channels
199 | 				int diff = getDiffFactor(src_color, src_prev);
200 | 				src_prev = src_color;
201 | 
202 | 				float alpha_f = range_table_f[diff];
203 | 
204 | 				*left_pass_factor++ = inv_alpha_f + alpha_f * (*prev_factor++);
205 | 
206 | 				for (int c = 0; c < channel; c++)
207 | 				{
208 | 					*left_pass_color++ = inv_alpha_f * (*src_color++) + alpha_f * (*prev_color++);
209 | 				}
210 | 			}
211 | 		}
212 | 	}
213 | 
214 | 	///////////////
215 | 	// Right pass
216 | 	{
217 | 		// start from end and then go up to begining 
218 | 		int last_index = width * height * channel - 1;
219 | 		const unsigned char* src_color = img_src + last_index;
220 | 		float* right_pass_color = m_right_pass_color + last_index;
221 | 		float* right_pass_factor = m_right_pass_factor + width * height - 1;
222 | 
223 | 		for (int y = 0; y < height; y++)
224 | 		{
225 | 			const unsigned char* src_prev = src_color;
226 | 			const float* prev_factor = right_pass_factor;
227 | 			const float* prev_color = right_pass_color;
228 | 
229 | 			// process 1st pixel separately since it has no previous
230 | 			*right_pass_factor-- = 1.f;
231 | 			for (int c = 0; c < channel; c++)
232 | 			{
233 | 				*right_pass_color-- = *src_color--;
234 | 			}
235 | 
236 | 			// handle other pixels
237 | 			for (int x = 1; x < width; x++)
238 | 			{
239 | 				// determine difference in pixel color between current and previous
240 | 				// calculation is different depending on number of channels
241 | 				int diff = getDiffFactor(src_color, src_color - 3);
242 | 				//	src_prev = src_color;
243 | 
244 | 				float alpha_f = range_table_f[diff];
245 | 
246 | 				*right_pass_factor-- = inv_alpha_f + alpha_f * (*prev_factor--);
247 | 
248 | 				for (int c = 0; c < channel; c++)
249 | 				{
250 | 					*right_pass_color-- = inv_alpha_f * (*src_color--) + alpha_f * (*prev_color--);
251 | 				}
252 | 			}
253 | 		}
254 | 	}
255 | 
256 | 	// vertical pass will be applied on top on horizontal pass, while using pixel differences from original image
257 | 	// result color stored in 'm_left_pass_color' and vertical pass will use it as source color
258 | 	{
259 | 		float* img_out = m_left_pass_color; // use as temporary buffer
260 | 		const float* left_pass_color = m_left_pass_color;
261 | 		const float* left_pass_factor = m_left_pass_factor;
262 | 		const float* right_pass_color = m_right_pass_color;
263 | 		const float* right_pass_factor = m_right_pass_factor;
264 | 
265 | 		int width_height = width * height;
266 | 		for (int i = 0; i < width_height; i++)
267 | 		{
268 | 			// average color divided by average factor
269 | 			float factor = 1.f / ((*left_pass_factor++) + (*right_pass_factor++));
270 | 			for (int c = 0; c < channel; c++)
271 | 			{
272 | 				*img_out++ = (factor * ((*left_pass_color++) + (*right_pass_color++)));
273 | 			}
274 | 		}
275 | 	}
276 | 
277 | 	///////////////
278 | 	// Down pass
279 | 	{
280 | 		const float* src_color_hor = m_left_pass_color; // result of horizontal pass filter
281 | 
282 | 		const unsigned char* src_color = img_src;
283 | 		float* down_pass_color = m_down_pass_color;
284 | 		float* down_pass_factor = m_down_pass_factor;
285 | 
286 | 		const unsigned char* src_prev = src_color;
287 | 		const float* prev_color = down_pass_color;
288 | 		const float* prev_factor = down_pass_factor;
289 | 
290 | 		// 1st line done separately because no previous line
291 | 		for (int x = 0; x < width; x++)
292 | 		{
293 | 			*down_pass_factor++ = 1.f;
294 | 			for (int c = 0; c < channel; c++)
295 | 			{
296 | 				*down_pass_color++ = *src_color_hor++;
297 | 			}
298 | 			src_color += channel;
299 | 		}
300 | 
301 | 		// handle other lines
302 | 		for (int y = 1; y < height; y++)
303 | 		{
304 | 			for (int x = 0; x < width; x++)
305 | 			{
306 | 				// determine difference in pixel color between current and previous
307 | 				// calculation is different depending on number of channels
308 | 				int diff = getDiffFactor(src_color, src_prev);
309 | 				src_prev += channel;
310 | 				src_color += channel;
311 | 
312 | 				float alpha_f = range_table_f[diff];
313 | 
314 | 				*down_pass_factor++ = inv_alpha_f + alpha_f * (*prev_factor++);
315 | 
316 | 				for (int c = 0; c < channel; c++)
317 | 				{
318 | 					*down_pass_color++ = inv_alpha_f * (*src_color_hor++) + alpha_f * (*prev_color++);
319 | 				}
320 | 			}
321 | 		}
322 | 	}
323 | 
324 | 	///////////////
325 | 	// Up pass
326 | 	{
327 | 		// start from end and then go up to begining 
328 | 		int last_index = width * height * channel - 1;
329 | 		const unsigned char* src_color = img_src + last_index;
330 | 		const float* src_color_hor = m_left_pass_color + last_index; // result of horizontal pass filter
331 | 		float* up_pass_color = m_up_pass_color + last_index;
332 | 		float* up_pass_factor = m_up_pass_factor + (width * height - 1);
333 | 
334 | 		//	const unsigned char* src_prev = src_color;
335 | 		const float* prev_color = up_pass_color;
336 | 		const float* prev_factor = up_pass_factor;
337 | 
338 | 		// 1st line done separately because no previous line
339 | 		for (int x = 0; x < width; x++)
340 | 		{
341 | 			*up_pass_factor-- = 1.f;
342 | 			for (int c = 0; c < channel; c++)
343 | 			{
344 | 				*up_pass_color-- = *src_color_hor--;
345 | 			}
346 | 			src_color -= channel;
347 | 		}
348 | 
349 | 		// handle other lines
350 | 		for (int y = 1; y < height; y++)
351 | 		{
352 | 			for (int x = 0; x < width; x++)
353 | 			{
354 | 				// determine difference in pixel color between current and previous
355 | 				// calculation is different depending on number of channels
356 | 				src_color -= channel;
357 | 				int diff = getDiffFactor(src_color, src_color + width * channel);
358 | 
359 | 				float alpha_f = range_table_f[diff];
360 | 
361 | 				*up_pass_factor-- = inv_alpha_f + alpha_f * (*prev_factor--);
362 | 
363 | 				for (int c = 0; c < channel; c++)
364 | 				{
365 | 					*up_pass_color-- = inv_alpha_f * (*src_color_hor--) + alpha_f * (*prev_color--);
366 | 				}
367 | 			}
368 | 		}
369 | 	}
370 | 
371 | 	///////////////
372 | 	// average result of vertical pass is written to output buffer
373 | 	{
374 | 		const float* down_pass_color = m_down_pass_color;
375 | 		const float* down_pass_factor = m_down_pass_factor;
376 | 		const float* up_pass_color = m_up_pass_color;
377 | 		const float* up_pass_factor = m_up_pass_factor;
378 | 
379 | 		int width_height = width * height;
380 | 		for (int i = 0; i < width_height; i++)
381 | 		{
382 | 			// average color divided by average factor
383 | 			float factor = 1.f / ((*up_pass_factor++) + (*down_pass_factor++));
384 | 			for (int c = 0; c < channel; c++)
385 | 			{
386 | 				*img_dst++ = (unsigned char)(factor * ((*up_pass_color++) + (*down_pass_color++)));
387 | 			}
388 | 		}
389 | 	}
390 | }
391 | 


--------------------------------------------------------------------------------
/rbf.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef INCLUDE_RBF
  2 | #define INCLUDE_RBF
  3 | #include <math.h>
  4 | #include <string.h>
  5 | #define QX_DEF_CHAR_MAX 255
  6 | 
  7 | /* ======================================================================
  8 | 
  9 | RecursiveBF: A lightweight library for recursive bilateral filtering.
 10 | 
 11 | -------------------------------------------------------------------------
 12 | 
 13 | Intro:      Recursive bilateral filtering (developed by Qingxiong Yang) 
 14 |             is pretty fast compared with most edge-preserving filtering 
 15 |             methods.
 16 | 
 17 |             -   computational complexity is linear in both input size and 
 18 |                 dimensionality
 19 |             -   takes about 43 ms to process a one mega-pixel color image
 20 |                 (i7 1.8GHz & 4GB memory)
 21 |             -   about 18x faster than Fast high-dimensional filtering 
 22 |                 using the permutohedral lattice
 23 |             -   about 86x faster than Gaussian kd-trees for fast high-
 24 |                 dimensional filtering
 25 | 
 26 | 
 27 | Usage:      // ----------------------------------------------------------
 28 |             // Basic Usage
 29 |             // ----------------------------------------------------------
 30 | 
 31 |             unsigned char * img = ...;                    // input image
 32 |             unsigned char * img_out = 0;            // output image
 33 |             int width = ..., height = ..., channel = ...; // image size
 34 |             recursive_bf(img, img_out, 
 35 |                          sigma_spatial, sigma_range, 
 36 |                          width, height, channel);
 37 | 
 38 |             // ----------------------------------------------------------
 39 |             // Advanced: using external buffer for better performance
 40 |             // ----------------------------------------------------------
 41 | 
 42 |             unsigned char * img = ...;                    // input image
 43 |             unsigned char * img_out = 0;            // output image
 44 |             int width = ..., height = ..., channel = ...; // image size
 45 |             float * buffer = new float[                   // external buf
 46 |                                  ( width * height* channel 
 47 |                                  + width * height
 48 |                                  + width * channel 
 49 |                                  + width) * 2];
 50 |             recursive_bf(img, img_out, 
 51 |                          sigma_spatial, sigma_range, 
 52 |                          width, height, channel, 
 53 |                          buffer);
 54 |             delete[] buffer;
 55 | 
 56 | 
Notice:     Large sigma_spatial/sigma_range parameters may result in 
            visible artifacts, which can be removed by an additional 
            filter pass with small sigma_spatial/sigma_range parameters.
 60 | 
 61 | -------------------------------------------------------------------------
 62 | 
 63 | Reference:  Qingxiong Yang, Recursive Bilateral Filtering,
 64 |             European Conference on Computer Vision (ECCV) 2012, 399-413.
 65 | 
 66 | ====================================================================== */
 67 | 
 68 | inline void recursive_bf(
 69 |     unsigned char * img_in, 
 70 |     unsigned char *& img_out, 
 71 |     float sigma_spatial, float sigma_range, 
 72 |     int width, int height, int channel, 
 73 |     float * buffer /*= 0*/);
 74 | 
 75 | // ----------------------------------------------------------------------
 76 | 
 77 | inline void _recursive_bf(
 78 |     unsigned char * img,
 79 |     float sigma_spatial, float sigma_range, 
 80 |     int width, int height, int channel,
 81 |     float * buffer = 0)
 82 | {
 83 |     const int width_height = width * height;
 84 |     const int width_channel = width * channel;
 85 |     const int width_height_channel = width * height * channel;
 86 | 
 87 |     bool is_buffer_internal = (buffer == 0);
 88 |     if (is_buffer_internal)
 89 |         buffer = new float[(width_height_channel + width_height 
 90 |                             + width_channel + width) * 2];
 91 | 
 92 |     float * img_out_f = buffer;
 93 |     float * img_temp = &img_out_f[width_height_channel];
 94 |     float * map_factor_a = &img_temp[width_height_channel];
 95 |     float * map_factor_b = &map_factor_a[width_height]; 
 96 |     float * slice_factor_a = &map_factor_b[width_height];
 97 |     float * slice_factor_b = &slice_factor_a[width_channel];
 98 |     float * line_factor_a = &slice_factor_b[width_channel];
 99 |     float * line_factor_b = &line_factor_a[width];
100 |     
101 |     //compute a lookup table
102 |     float range_table[QX_DEF_CHAR_MAX + 1];
103 |     float inv_sigma_range = 1.0f / (sigma_range * QX_DEF_CHAR_MAX);
104 |     for (int i = 0; i <= QX_DEF_CHAR_MAX; i++) 
105 |         range_table[i] = static_cast<float>(exp(-i * inv_sigma_range));
106 | 
107 |     float alpha = static_cast<float>(exp(-sqrt(2.0) / (sigma_spatial * width)));
108 |     float ypr, ypg, ypb, ycr, ycg, ycb;
109 |     float fp, fc;
110 |     float inv_alpha_ = 1 - alpha;
111 |     for (int y = 0; y < height; y++)
112 |     {
113 |         float * temp_x = &img_temp[y * width_channel];
114 |         unsigned char * in_x = &img[y * width_channel];
115 |         unsigned char * texture_x = &img[y * width_channel];
116 |         *temp_x++ = ypr = *in_x++; 
117 |         *temp_x++ = ypg = *in_x++; 
118 |         *temp_x++ = ypb = *in_x++;
119 |         unsigned char tpr = *texture_x++; 
120 |         unsigned char tpg = *texture_x++;
121 |         unsigned char tpb = *texture_x++;
122 | 
123 |         float * temp_factor_x = &map_factor_a[y * width];
124 |         *temp_factor_x++ = fp = 1;
125 | 
126 |         // from left to right
127 |         for (int x = 1; x < width; x++) 
128 |         {
129 |             unsigned char tcr = *texture_x++; 
130 |             unsigned char tcg = *texture_x++; 
131 |             unsigned char tcb = *texture_x++;
132 |             unsigned char dr = abs(tcr - tpr);
133 |             unsigned char dg = abs(tcg - tpg);
134 |             unsigned char db = abs(tcb - tpb);
135 |             int range_dist = (((dr << 1) + dg + db) >> 2);
136 |             float weight = range_table[range_dist];
137 |             float alpha_ = weight*alpha;
138 |             *temp_x++ = ycr = inv_alpha_*(*in_x++) + alpha_*ypr; 
139 |             *temp_x++ = ycg = inv_alpha_*(*in_x++) + alpha_*ypg; 
140 |             *temp_x++ = ycb = inv_alpha_*(*in_x++) + alpha_*ypb;
141 |             tpr = tcr; tpg = tcg; tpb = tcb;
142 |             ypr = ycr; ypg = ycg; ypb = ycb;
143 |             *temp_factor_x++ = fc = inv_alpha_ + alpha_*fp;
144 |             fp = fc;
145 |         }
146 |         *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x));
147 |         *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x));
148 |         *--temp_x; *temp_x = 0.5f*((*temp_x) + (*--in_x));
149 |         tpr = *--texture_x; 
150 |         tpg = *--texture_x; 
151 |         tpb = *--texture_x;
152 |         ypr = *in_x; ypg = *in_x; ypb = *in_x;
153 | 
154 |         *--temp_factor_x; *temp_factor_x = 0.5f*((*temp_factor_x) + 1);
155 |         fp = 1;
156 | 
157 |         // from right to left
158 |         for (int x = width - 2; x >= 0; x--) 
159 |         {
160 |             unsigned char tcr = *--texture_x; 
161 |             unsigned char tcg = *--texture_x; 
162 |             unsigned char tcb = *--texture_x;
163 |             unsigned char dr = abs(tcr - tpr);
164 |             unsigned char dg = abs(tcg - tpg);
165 |             unsigned char db = abs(tcb - tpb);
166 |             int range_dist = (((dr << 1) + dg + db) >> 2);
167 |             float weight = range_table[range_dist];
168 |             float alpha_ = weight * alpha;
169 | 
170 |             ycr = inv_alpha_ * (*--in_x) + alpha_ * ypr; 
171 |             ycg = inv_alpha_ * (*--in_x) + alpha_ * ypg; 
172 |             ycb = inv_alpha_ * (*--in_x) + alpha_ * ypb;
173 |             *--temp_x; *temp_x = 0.5f*((*temp_x) + ycr);
174 |             *--temp_x; *temp_x = 0.5f*((*temp_x) + ycg);
175 |             *--temp_x; *temp_x = 0.5f*((*temp_x) + ycb);
176 |             tpr = tcr; tpg = tcg; tpb = tcb;
177 |             ypr = ycr; ypg = ycg; ypb = ycb;
178 | 
179 |             fc = inv_alpha_ + alpha_*fp;
180 |             *--temp_factor_x; 
181 |             *temp_factor_x = 0.5f*((*temp_factor_x) + fc);
182 |             fp = fc;
183 |         }
184 |     }
185 |     alpha = static_cast<float>(exp(-sqrt(2.0) / (sigma_spatial * height)));
186 |     inv_alpha_ = 1 - alpha;
187 |     float * ycy, * ypy, * xcy;
188 |     unsigned char * tcy, * tpy;
189 |     memcpy(img_out_f, img_temp, sizeof(float)* width_channel);
190 | 
191 |     float * in_factor = map_factor_a;
192 |     float*ycf, *ypf, *xcf;
193 |     memcpy(map_factor_b, in_factor, sizeof(float) * width);
194 |     for (int y = 1; y < height; y++)
195 |     {
196 |         tpy = &img[(y - 1) * width_channel];
197 |         tcy = &img[y * width_channel];
198 |         xcy = &img_temp[y * width_channel];
199 |         ypy = &img_out_f[(y - 1) * width_channel];
200 |         ycy = &img_out_f[y * width_channel];
201 | 
202 |         xcf = &in_factor[y * width];
203 |         ypf = &map_factor_b[(y - 1) * width];
204 |         ycf = &map_factor_b[y * width];
205 |         for (int x = 0; x < width; x++)
206 |         {
207 |             unsigned char dr = abs((*tcy++) - (*tpy++));
208 |             unsigned char dg = abs((*tcy++) - (*tpy++));
209 |             unsigned char db = abs((*tcy++) - (*tpy++));
210 |             int range_dist = (((dr << 1) + dg + db) >> 2);
211 |             float weight = range_table[range_dist];
212 |             float alpha_ = weight*alpha;
213 |             for (int c = 0; c < channel; c++) 
214 |                 *ycy++ = inv_alpha_*(*xcy++) + alpha_*(*ypy++);
215 |             *ycf++ = inv_alpha_*(*xcf++) + alpha_*(*ypf++);
216 |         }
217 |     }
218 |     int h1 = height - 1;
219 |     ycf = line_factor_a;
220 |     ypf = line_factor_b;
221 |     memcpy(ypf, &in_factor[h1 * width], sizeof(float) * width);
222 |     for (int x = 0; x < width; x++) 
223 |         map_factor_b[h1 * width + x] = 0.5f*(map_factor_b[h1 * width + x] + ypf[x]);
224 | 
225 |     ycy = slice_factor_a;
226 |     ypy = slice_factor_b;
227 |     memcpy(ypy, &img_temp[h1 * width_channel], sizeof(float)* width_channel);
228 |     int k = 0; 
229 |     for (int x = 0; x < width; x++) {
230 |         for (int c = 0; c < channel; c++) {
231 |             int idx = (h1 * width + x) * channel + c;
232 |             img_out_f[idx] = 0.5f*(img_out_f[idx] + ypy[k++]) / map_factor_b[h1 * width + x];
233 |         }
234 |     }
235 | 
236 |     for (int y = h1 - 1; y >= 0; y--)
237 |     {
238 |         tpy = &img[(y + 1) * width_channel];
239 |         tcy = &img[y * width_channel];
240 |         xcy = &img_temp[y * width_channel];
241 |         float*ycy_ = ycy;
242 |         float*ypy_ = ypy;
243 |         float*out_ = &img_out_f[y * width_channel];
244 | 
245 |         xcf = &in_factor[y * width];
246 |         float*ycf_ = ycf;
247 |         float*ypf_ = ypf;
248 |         float*factor_ = &map_factor_b[y * width];
249 |         for (int x = 0; x < width; x++)
250 |         {
251 |             unsigned char dr = abs((*tcy++) - (*tpy++));
252 |             unsigned char dg = abs((*tcy++) - (*tpy++));
253 |             unsigned char db = abs((*tcy++) - (*tpy++));
254 |             int range_dist = (((dr << 1) + dg + db) >> 2);
255 |             float weight = range_table[range_dist];
256 |             float alpha_ = weight*alpha;
257 | 
258 |             float fcc = inv_alpha_*(*xcf++) + alpha_*(*ypf_++);
259 |             *ycf_++ = fcc;
260 |             *factor_ = 0.5f * (*factor_ + fcc);
261 | 
262 |             for (int c = 0; c < channel; c++)
263 |             {
264 |                 float ycc = inv_alpha_*(*xcy++) + alpha_*(*ypy_++);
265 |                 *ycy_++ = ycc;
266 |                 *out_ = 0.5f * (*out_ + ycc) / (*factor_);
267 |                 *out_++;
268 |             }
269 |             *factor_++;
270 |         }
271 |         memcpy(ypy, ycy, sizeof(float) * width_channel);
272 |         memcpy(ypf, ycf, sizeof(float) * width);
273 |     }
274 | 
275 |     for (int i = 0; i < width_height_channel; ++i)
276 |         img[i] = static_cast<unsigned char>(img_out_f[i]);
277 | 
278 |     if (is_buffer_internal)
279 |         delete[] buffer;
280 | }
281 | 
282 | 
283 | inline void recursive_bf(
284 |     unsigned char * img_in,
285 |     unsigned char *& img_out,
286 |     float sigma_spatial, float sigma_range,
287 |     int width, int height, int channel,
288 |     float * buffer = 0)
289 | {
290 |     if (img_out == 0)
291 |         img_out = new unsigned char[width * height * channel];
292 |     for (int i = 0; i < width * height * channel; ++i)
293 |         img_out[i] = img_in[i];
294 |     _recursive_bf(img_out, sigma_spatial, sigma_range, width, height, channel, buffer);
295 | }
296 | 
297 | #endif // INCLUDE_RBF
298 | 


--------------------------------------------------------------------------------
/RecursiveBilateralFilter.cpp:
--------------------------------------------------------------------------------
// The purpose of this file is to run a series of tests on several images using different implementations of the
// Recursive Bilateral Filter, and to show a rough time estimate for each run
  3 | 
  4 | #include "stdafx.h"
  5 | #include "stb_image.h"
  6 | #include "stb_image_write.h"
  7 | #include "rbf.hpp"
  8 | #include <time.h>
  9 | #include <iostream>
 10 | #include <time.h>
 11 | #include "RBFilter_SSE2.h"
 12 | #include "RBFilter_AVX2.h"
 13 | #include <iomanip>
 14 | 
 15 | using namespace std;
 16 | 
 17 | // main filter strength controls
 18 | const float sigma_spatial = 0.12f;
 19 | const float sigma_range = 0.09f;
 20 | 
 21 | // number of test runs per image, for better average time measurement
 22 | // if running debug mode, use small number so it's faster
 23 | #ifdef _DEBUG
 24 | const int test_runs = 1;  
 25 | #else
 26 | const int test_runs = 100;
 27 | #endif
 28 | 
 29 | // path where files are located, you may need to change this
 30 | const char images_folder_path[] = "./images/";
 31 | 
 32 | // test images:
 33 | const char file_name_testGirl[] = "testGirl.jpg";		// size: 448 x 626
 34 | const char file_name_house[] = "Thefarmhouse.jpg";		// size: 1440 x 1080
 35 | const char file_name_testpattern[] = "testpatern5.png"; // size: 1920 x 1080
 36 | 
 37 | 
 38 | // timer uses 'test_runs' as divisor
 39 | class TestRunTimer 
 40 | {
 41 | 	clock_t begTime;
 42 | 
 43 | public:
 44 | 	void start() { begTime = clock(); }
 45 | 	float elapsedTimeMS() { return float(clock() - begTime) / (float)test_runs; }
 46 | };
 47 | 
 48 | // utility for setting output file name
 49 | template <size_t _Size>
 50 | char* modifyFilePath(char (&file_path)[_Size], const char* suffix)
 51 | {
 52 | 	size_t l = strlen(file_path);
 53 | 	// get rid of old extension
 54 | 	for (size_t i = l - 1; i > 0; i--)
 55 | 	{
 56 | 		if (file_path[i] == '.')
 57 | 		{
 58 | 			file_path[i] = 0;
 59 | 			break;
 60 | 		}
 61 | 	}
 62 | 
 63 | 	// add current sigma values just for clarity
 64 | 	char extra_text[64];
 65 | 	sprintf_s(extra_text, "%0.3f_%0.3f", sigma_spatial, sigma_range);
 66 | 
 67 | 	// add suffix
 68 | 	strcat_s(file_path, "_");
 69 | 	strcat_s(file_path, suffix);
 70 | 	strcat_s(file_path, "_");
 71 | 	strcat_s(file_path, extra_text);
 72 | 	strcat_s(file_path, ".png"); // force PNG format
 73 | 
 74 | 	return file_path;
 75 | }
 76 | 
 77 | // using original implementation, source code from
 78 | // https://github.com/ufoym/RecursiveBF
 79 | void testRunRecursiveBF_Original(const char* image_name)
 80 | {
 81 | 	cout << "\nImage: " << image_name;
 82 | 	char file_path[256];
 83 | 	strcpy_s(file_path, images_folder_path);
 84 | 	strcat_s(file_path, image_name);
 85 | 
 86 | 	int width, height, channel;
 87 | 	unsigned char * img = stbi_load(file_path, &width, &height, &channel, 3);
 88 | 	if (!img)
 89 | 	{
 90 | 		cout << "\nFailed to load image path: " << file_path;
 91 | 		return;
 92 | 	}
 93 | 	cout << ", size: " << width << " x " << height;
 94 | 	channel = 3; // require 3 channel for this test
 95 | 	unsigned char * img_out = nullptr;
 96 | 	TestRunTimer timer;
 97 | 
 98 | 	// memory reserve for filter algorithm before timer start
 99 | 	float * buffer = new float[(width * height* channel + width * height + width * channel + width) * 2];
100 | 
101 | 	timer.start();
102 | 	for (int i = 0; i < test_runs; ++i)
103 | 		recursive_bf(img, img_out, sigma_spatial, sigma_range, width, height, channel, buffer);
104 | 	
105 | 	cout << ", time ms: " << timer.elapsedTimeMS();
106 | 	
107 | 	delete[] buffer;
108 | 
109 | 	modifyFilePath(file_path, "RBF");
110 | 	stbi_write_png(file_path, width, height, channel, img_out, width * 3);
111 | 
112 | 	delete[] img;
113 | 	delete[] img_out;
114 | }
115 | 
116 | 
117 | // using optimized SSE2 with optional multithreading, single stage (non-pipelined)
118 | void testRunRecursiveBF_SSE2_mt(const char* image_name, int thread_count)
119 | {
120 | 	cout << "\nImage: " << image_name;
121 | 	char file_path[256];
122 | 	strcpy_s(file_path, images_folder_path);
123 | 	strcat_s(file_path, image_name);
124 | 
125 | 	int width, height, channel;
126 | 	unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4);
127 | 	if (!img)
128 | 	{
129 | 		cout << "\nFailed to load image path: " << file_path;
130 | 		return;
131 | 	}
132 | 	cout << ", size: " << width << " x " << height;
133 | 	channel = 4; // require 4 channel for this test
134 | 
135 | 	CRBFilterSSE2 rbf_object;
136 | 	bool success = rbf_object.initialize(width, height, thread_count, false);
137 | 	if (!success)
138 | 	{
139 | 		cout << "\nCRBFilterSSE2 failed to initialize for some reason";
140 | 		delete[] img;
141 | 		return;
142 | 	}
143 | 	rbf_object.setSigma(sigma_spatial, sigma_range);
144 | 
145 | 	unsigned char * img_out = new unsigned char[width * height * 4];
146 | 
147 | 
148 | 	TestRunTimer timer;
149 | 	timer.start();
150 | 	
151 | 	for (int i = 0; i < test_runs; ++i)
152 | 		success = rbf_object.filter(img_out, img, width, height, width * 4);
153 | 
154 | 	if (success)
155 | 	{
156 | 		cout << ", time ms: " << timer.elapsedTimeMS();
157 | 	}
158 | 	else // fail
159 | 	{
160 | 		cout << "\nCRBFilterSSE2::filter failed for some reason";
161 | 	}
162 | 
163 | 	char suffix[64];
164 | 	sprintf_s(suffix, "SSE2_%dt", thread_count);
165 | 	modifyFilePath(file_path, suffix);
166 | 	stbi_write_png(file_path, width, height, channel, img_out, width * 4);
167 | 
168 | 	delete[] img;
169 | 	delete[] img_out;
170 | }
171 | 
172 | // using optimized SSE2 with optional multithreading, pipelined 2 stages
173 | void testRunRecursiveBF_SSE2_Pipelined(const char* image_name, int thread_count)
174 | {
175 | 	cout << "\nImage: " << image_name;
176 | 	char file_path[256];
177 | 	strcpy_s(file_path, images_folder_path);
178 | 	strcat_s(file_path, image_name);
179 | 
180 | 	int width, height, channel;
181 | 	unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4);
182 | 	if (!img)
183 | 	{
184 | 		cout << "\nFailed to load image path: " << file_path;
185 | 		return;
186 | 	}
187 | 	cout << ", size: " << width << " x " << height;
188 | 	channel = 4; // require 4 channel for this test
189 | 
190 | 	CRBFilterSSE2 rbf_object;
191 | 	bool success = rbf_object.initialize(width, height, thread_count, true);
192 | 	if (!success)
193 | 	{
194 | 		cout << "\nCRBFilterSSE2 failed to initialize for some reason";
195 | 		delete[] img;
196 | 		return;
197 | 	}
198 | 	rbf_object.setSigma(sigma_spatial, sigma_range);
199 | 
200 | 	// need 2 output buffers, one for each stage
201 | 	unsigned char * img_out[2];
202 | 	img_out[0] = new unsigned char[width * height * 4];
203 | 	img_out[1] = new unsigned char[width * height * 4];
204 | 
205 | 	TestRunTimer timer;
206 | 	timer.start();
207 | 
208 | 	for (int i = 0; i < test_runs; ++i)
209 | 		success = rbf_object.filterPipePush(img_out[i&1], img, width, height, width * 4);
210 | 	
211 | 	rbf_object.filterPipeFlush();
212 | 
213 | 	if (success)
214 | 	{
215 | 		cout << ", time ms: " << timer.elapsedTimeMS();
216 | 	}
217 | 	else // fail
218 | 	{
219 | 		cout << "\nCRBFilterSSE2::filterPipePush failed for some reason";
220 | 	}
221 | 
222 | 	char suffix[64];
223 | 	sprintf_s(suffix, "SSE2_Pipe_%dt", thread_count);
224 | 	modifyFilePath(file_path, suffix);
225 | 	stbi_write_png(file_path, width, height, channel, img_out[0], width * 4);
226 | 
227 | 	delete[] img;
228 | 	delete[] img_out[0];
229 | 	delete[] img_out[1];
230 | }
231 | 
232 | 
233 | // using optimized AVX2 with optional multithreading, single stage (non-pipelined)
234 | void testRunRecursiveBF_AVX2_mt(const char* image_name, int thread_count)
235 | {
236 | 	cout << "\nImage: " << image_name;
237 | 	char file_path[256];
238 | 	strcpy_s(file_path, images_folder_path);
239 | 	strcat_s(file_path, image_name);
240 | 
241 | 	int width, height, channel;
242 | 	unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4);
243 | 	if (!img)
244 | 	{
245 | 		cout << "\nFailed to load image path: " << file_path;
246 | 		return;
247 | 	}
248 | 	cout << ", size: " << width << " x " << height;
249 | 	channel = 4; // require 4 channel for this test
250 | 
251 | 	CRBFilterAVX2 rbf_object;
252 | 	bool success = rbf_object.initialize(width, height, thread_count, false);
253 | 	if (!success)
254 | 	{
255 | 		cout << "\nCRBFilterAVX2 failed to initialize for some reason";
256 | 		delete[] img;
257 | 		return;
258 | 	}
259 | 	rbf_object.setSigma(sigma_spatial, sigma_range);
260 | 
261 | 	int pitch = rbf_object.getOptimalPitch(width);
262 | 	unsigned char * img_out;
263 | 
264 | 	// setup 32 byte aligned memory buffers for input and output, using optimal pitch
265 | 	{
266 | 		img_out = (unsigned char*)_aligned_malloc(pitch * height, 32);
267 | 
268 | 		// move source image to aligned memory
269 | 		unsigned char* buffer = (unsigned char*)_aligned_malloc(pitch * height, 32);
270 | 		for (int y = 0; y < height; y++)
271 | 		{
272 | 			memcpy(buffer + y * pitch, img + y * width * 4, width * 4);
273 | 		}
274 | 		delete[] img;
275 | 		img = buffer;
276 | 	}
277 | 
278 | 	TestRunTimer timer;
279 | 	timer.start();
280 | 
281 | 	for (int i = 0; i < test_runs; ++i)
282 | 		success = rbf_object.filter(img_out, img, width, height, pitch);
283 | 
284 | 	if (success)
285 | 	{
286 | 		cout << ", time ms: " << timer.elapsedTimeMS();
287 | 	}
288 | 	else // fail
289 | 	{
290 | 		cout << "\nCRBFilterAVX2::filter failed for some reason";
291 | 	}
292 | 
293 | 	char suffix[64];
294 | 	sprintf_s(suffix, "AVX2_%dt", thread_count);
295 | 	modifyFilePath(file_path, suffix);
296 | 	stbi_write_png(file_path, width, height, channel, img_out, pitch);
297 | 
298 | 	_aligned_free(img);
299 | 	_aligned_free(img_out);
300 | }
301 | 
302 | // using optimized AVX2 with optional multithreading, pipelined 2 stages, memory aligned
303 | void testRunRecursiveBF_AVX2_Pipelined(const char* image_name, int thread_count)
304 | {
305 | 	cout << "\nImage: " << image_name;
306 | 	char file_path[256];
307 | 	strcpy_s(file_path, images_folder_path);
308 | 	strcat_s(file_path, image_name);
309 | 
310 | 	int width, height, channel;
311 | 	unsigned char * img = stbi_load(file_path, &width, &height, &channel, 4);
312 | 	if (!img)
313 | 	{
314 | 		cout << "\nFailed to load image path: " << file_path;
315 | 		return;
316 | 	}
317 | 	cout << ", size: " << width << " x " << height;
318 | 	channel = 4; // require 4 channel for this test
319 | 
320 | 	CRBFilterAVX2 rbf_object;
321 | 	bool success = rbf_object.initialize(width, height, thread_count, true);
322 | 	if (!success)
323 | 	{
324 | 		cout << "\nCRBFilterAVX2 failed to initialize for some reason";
325 | 		delete[] img;
326 | 		return;
327 | 	}
328 | 	rbf_object.setSigma(sigma_spatial, sigma_range);
329 | 
330 | 	int pitch = rbf_object.getOptimalPitch(width);
331 | 	unsigned char* img_out[2];
332 | 
333 | 	// setup 32 byte aligned memory buffers for input and output, using optimal pitch
334 | 	{
335 | 		img_out[0] = (unsigned char*)_aligned_malloc(pitch * height, 32);
336 | 		img_out[1] = (unsigned char*)_aligned_malloc(pitch * height, 32);
337 | 
338 | 		// move source image to aligned memory
339 | 		unsigned char* buffer = (unsigned char*)_aligned_malloc(pitch * height, 32);
340 | 		for (int y = 0; y < height; y++)
341 | 		{
342 | 			memcpy(buffer + y * pitch, img + y * width * 4, width * 4);
343 | 		}
344 | 		delete[] img;
345 | 		img = buffer;
346 | 	}
347 | 
348 | 	TestRunTimer timer;
349 | 	timer.start();
350 | 
351 | 	for (int i = 0; i < test_runs; ++i)
352 | 		success = rbf_object.filterPipePush(img_out[i & 1], img, width, height, width * 4);
353 | 
354 | 	rbf_object.filterPipeFlush();
355 | 
356 | 	if (success)
357 | 	{
358 | 		cout << ", time ms: " << timer.elapsedTimeMS();
359 | 	}
360 | 	else // fail
361 | 	{
362 | 		cout << "\nCRBFilterAVX2::filterPipePush failed for some reason";
363 | 	}
364 | 
365 | 	char suffix[64];
366 | 	sprintf_s(suffix, "AVX2_Pipe_%dt", thread_count);
367 | 	modifyFilePath(file_path, suffix);
368 | 	stbi_write_png(file_path, width, height, channel, img_out[0], pitch);
369 | 
370 | 	_aligned_free(img);
371 | 	_aligned_free(img_out[0]);
372 | 	_aligned_free(img_out[1]);
373 | }
374 | 
375 | /////////////////////////////////////////////////////////////////////////////
376 | 
377 | int main()
378 | {
379 | 	cout << "test run \n";
380 | 	cout << fixed << setprecision(1);
381 | 
382 | 	////////////////////////
383 | 	cout << "\nOriginal Recursive Bilateral Filter implementation";
384 | 	// image: testpattern
385 | 	testRunRecursiveBF_Original(file_name_testpattern);
386 | 	// image: house
387 | 	testRunRecursiveBF_Original(file_name_house);
388 | 	// image: testGirl
389 | 	testRunRecursiveBF_Original(file_name_testGirl);
390 | 
391 | 	
392 | 	////////////////////////
393 | 	cout << "\n\nOptimized SSE2 single threaded, single stage (non-pipelined)";
394 | 	// image: testpattern
395 | 	testRunRecursiveBF_SSE2_mt(file_name_testpattern, 1);
396 | 	// image: house
397 | 	testRunRecursiveBF_SSE2_mt(file_name_house, 1);
398 | 	// image: testGirl
399 | 	testRunRecursiveBF_SSE2_mt(file_name_testGirl, 1);
400 | 
401 | 	////////////////////////
402 | 	cout << "\n\nOptimized SSE2 2x multithreading, single stage (non-pipelined)";
403 | 	// image: testpattern
404 | 	testRunRecursiveBF_SSE2_mt(file_name_testpattern, 2);
405 | 	// image: house
406 | 	testRunRecursiveBF_SSE2_mt(file_name_house, 2);
407 | 	// image: testGirl
408 | 	testRunRecursiveBF_SSE2_mt(file_name_testGirl, 2);
409 | 
410 | 	////////////////////////
411 | 	cout << "\n\nOptimized SSE2 4x multithreading, single stage (non-pipelined)";
412 | 	// image: testpattern
413 | 	testRunRecursiveBF_SSE2_mt(file_name_testpattern, 4);
414 | 	// image: house
415 | 	testRunRecursiveBF_SSE2_mt(file_name_house, 4);
416 | 	// image: testGirl
417 | 	testRunRecursiveBF_SSE2_mt(file_name_testGirl, 4);
418 | 
419 | 	////////////////////////
420 | 	cout << "\n\nOptimized SSE2 4x2 thread pipelined 2 stages";
421 | 	// image: testpattern
422 | 	testRunRecursiveBF_SSE2_Pipelined(file_name_testpattern, 4);
423 | 	// image: house
424 | 	testRunRecursiveBF_SSE2_Pipelined(file_name_house, 4);
425 | 	// image: testGirl
426 | 	testRunRecursiveBF_SSE2_Pipelined(file_name_testGirl, 4);
427 | 
428 | 	////////////////////////
429 | 	cout << "\n\nOptimized AVX2 single threaded, single stage (non-pipelined), memory aligned";
430 | 	// image: testpattern
431 | 	testRunRecursiveBF_AVX2_mt(file_name_testpattern, 1);
432 | 	// image: house
433 | 	testRunRecursiveBF_AVX2_mt(file_name_house, 1);
434 | 	// image: testGirl
435 | 	testRunRecursiveBF_AVX2_mt(file_name_testGirl, 1);
436 | 
437 | 	////////////////////////
438 | 	cout << "\n\nOptimized AVX2 2x multithreading, single stage (non-pipelined), memory aligned";
439 | 	// image: testpattern
440 | 	testRunRecursiveBF_AVX2_mt(file_name_testpattern, 2);
441 | 	// image: house
442 | 	testRunRecursiveBF_AVX2_mt(file_name_house, 2);
443 | 	// image: testGirl
444 | 	testRunRecursiveBF_AVX2_mt(file_name_testGirl, 2);
445 | 
446 | 	////////////////////////
447 | 	cout << "\n\nOptimized AVX2 4x multithreading, single stage (non-pipelined), memory aligned";
448 | 	// image: testpattern
449 | 	testRunRecursiveBF_AVX2_mt(file_name_testpattern, 4);
450 | 	// image: house
451 | 	testRunRecursiveBF_AVX2_mt(file_name_house, 4);
452 | 	// image: testGirl
453 | 	testRunRecursiveBF_AVX2_mt(file_name_testGirl, 4);
454 | 
455 | 	////////////////////////
456 | 	cout << "\n\nOptimized AVX2 4x2 thread pipelined 2 stages, memory aligned";
457 | 	// image: testpattern
458 | 	testRunRecursiveBF_AVX2_Pipelined(file_name_testpattern, 4);
459 | 	// image: house
460 | 	testRunRecursiveBF_AVX2_Pipelined(file_name_house, 4);
461 | 	// image: testGirl
462 | 	testRunRecursiveBF_AVX2_Pipelined(file_name_testGirl, 4);
463 | 
464 | 	cout << "\nFinish";
465 | 	cin.get();
466 | 
467 |     return 0;
468 | }
469 | 
470 | 


--------------------------------------------------------------------------------
/RBFilter_SSE2.cpp:
--------------------------------------------------------------------------------
  1 | #include "stdafx.h"
  2 | #include "RBFilter_SSE2.h"
  3 | #include <math.h>
  4 | #include <malloc.h>
  5 | #include <new>  
  6 | #include <emmintrin.h>
  7 | #include <tmmintrin.h>
  8 | #include <thread>
  9 | 
 10 | 
 11 | #define MAX_RANGE_TABLE_SIZE 255
 12 | #define ALIGN_SIZE 16
 13 | 
 14 | // only 1 of following 2 should be defined
 15 | #define EDGE_COLOR_USE_MAXIMUM
 16 | //#define EDGE_COLOR_USE_ADDITION
 17 | 
 18 | // if EDGE_COLOR_USE_MAXIMUM is defined, then edge color detection works by calculating
 19 | // maximum difference among 3 components (RGB) of 2 colors, which tends to result in lower differences (since only largest among 3 is selected)
 20 | // if EDGE_COLOR_USE_ADDITION is defined, then edge color detection works by calculating
 21 | // sum of all 3 components, while enforcing 255 maximum. This method is much more sensitive to small differences 
 22 | 
 23 | #if defined(EDGE_COLOR_USE_MAXIMUM) && defined(EDGE_COLOR_USE_ADDITION)
 24 | #error Only 1 of those can be defined
 25 | #endif
 26 | 
 27 | #if !defined(EDGE_COLOR_USE_MAXIMUM) && !defined(EDGE_COLOR_USE_ADDITION)
 28 | #error 1 of those must be defined
 29 | #endif
 30 | 
 31 | CRBFilterSSE2::CRBFilterSSE2()
 32 | {
 33 | 	m_range_table = new float[MAX_RANGE_TABLE_SIZE + 1];
 34 | 	memset(m_range_table, 0, (MAX_RANGE_TABLE_SIZE + 1) * sizeof(float));
 35 | }
 36 | 
 37 | CRBFilterSSE2::~CRBFilterSSE2()
 38 | {
 39 | 	release();
 40 | 
 41 | 	delete[] m_range_table;
 42 | }
 43 | 
 44 | bool CRBFilterSSE2::initialize(int width, int height, int thread_count, bool pipelined)
 45 | {
 46 | 	// basic sanity check, not strict
 47 | 	if (width < 16 || width > 10000)
 48 | 		return false;
 49 | 
 50 | 	if (height < 2 || height > 10000)
 51 | 		return false;
 52 | 
 53 | 	if (thread_count < 1 || thread_count > RBF_MAX_THREADS)
 54 | 		return false;
 55 | 	
 56 | 	release();
 57 | 
 58 | 	// round width up to nearest ALIGN_SIZE * thread_count
 59 | 	int round_up = (ALIGN_SIZE / 4) * thread_count;
 60 | 	if (width % round_up)
 61 | 	{
 62 | 		width += round_up - width % round_up;
 63 | 	}
 64 | 	m_reserved_width = width;
 65 | 	m_reserved_height = height;
 66 | 	m_thread_count = thread_count;
 67 | 
 68 | 	m_stage_buffer[0] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE);
 69 | 	if (!m_stage_buffer[0])
 70 | 		return false;
 71 | 
 72 | 	if (pipelined)
 73 | 	{
 74 | 		for (int i = 1; i < STAGE_BUFFER_COUNT; i++)
 75 | 		{
 76 | 			m_stage_buffer[i] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE);
 77 | 			if (!m_stage_buffer[i])
 78 | 				return false;
 79 | 		}
 80 | 	}
 81 | 
 82 | 	m_h_line_cache = new (std::nothrow) float*[m_thread_count];
 83 | 	if (!m_h_line_cache)
 84 | 		return false;
 85 | 
 86 | 	// zero just in case
 87 | 	for (int i = 0; i < m_thread_count; i++)
 88 | 		m_h_line_cache[i] = nullptr;
 89 | 
 90 | 	for (int i = 0; i < m_thread_count; i++)
 91 | 	{
 92 | 		m_h_line_cache[i] = (float*)_aligned_malloc(m_reserved_width * 12 * sizeof(float) , ALIGN_SIZE);
 93 | 		if (!m_h_line_cache[i])
 94 | 			return false;
 95 | 	}
 96 | 
 97 | //	if (m_pipelined)
 98 | 	{
 99 | 		m_v_line_cache = new (std::nothrow) float*[m_thread_count];
100 | 		if (!m_v_line_cache)
101 | 			return false;
102 | 
103 | 		for (int i = 0; i < m_thread_count; i++)
104 | 			m_v_line_cache[i] = nullptr;
105 | 
106 | 		for (int i = 0; i < m_thread_count; i++)
107 | 		{
108 | 			m_v_line_cache[i] = (float*)_aligned_malloc((m_reserved_width * 8 * sizeof(float)) / m_thread_count, ALIGN_SIZE);
109 | 			if (!m_v_line_cache[i])
110 | 				return false;
111 | 		}
112 | 	}
113 | 
114 | 
115 | 	return true;
116 | }
117 | 
118 | void CRBFilterSSE2::release()
119 | {
120 | 	for (int i = 0; i < STAGE_BUFFER_COUNT; i++)
121 | 	{
122 | 		if (m_stage_buffer[i])
123 | 		{
124 | 			_aligned_free(m_stage_buffer[i]);
125 | 			m_stage_buffer[i] = nullptr;
126 | 		}
127 | 	}
128 | 
129 | 	if (m_h_line_cache)
130 | 	{
131 | 		for (int i = 0; i < m_thread_count; i++)
132 | 		{
133 | 			if (m_h_line_cache[i])
134 | 				_aligned_free(m_h_line_cache[i]);
135 | 		}
136 | 		delete[] m_h_line_cache;
137 | 		m_h_line_cache = nullptr;
138 | 	}
139 | 
140 | //	if (m_pipelined)
141 | 	{
142 | 		for (int i = 0; i < m_thread_count; i++)
143 | 		{
144 | 			if (m_v_line_cache[i])
145 | 				_aligned_free(m_v_line_cache[i]);
146 | 		}
147 | 		delete[] m_v_line_cache;
148 | 	}
149 | 	m_v_line_cache = nullptr;
150 | 
151 | 	m_reserved_width = 0;
152 | 	m_reserved_height = 0;
153 | 	m_thread_count = 0;
154 | 	m_pipelined = false;
155 | 	m_filter_counter = 0;
156 | }
157 | 
158 | void CRBFilterSSE2::setSigma(float sigma_spatial, float sigma_range)
159 | {
160 | 	if (m_sigma_spatial != sigma_spatial || m_sigma_range != sigma_range)
161 | 	{
162 | 		m_sigma_spatial = sigma_spatial;
163 | 		m_sigma_range = sigma_range;
164 | 
165 | 		double alpha_f = (exp(-sqrt(2.0) / (sigma_spatial * 255.0)));
166 | 		m_inv_alpha_f = (float)(1.0 - alpha_f);
167 | 		double inv_sigma_range = 1.0 / (sigma_range * MAX_RANGE_TABLE_SIZE);
168 | 		{
169 | 			double ii = 0.f;
170 | 			for (int i = 0; i <= MAX_RANGE_TABLE_SIZE; i++, ii -= 1.0)
171 | 			{
172 | 				m_range_table[i] = (float)(alpha_f * exp(ii * inv_sigma_range));
173 | 			}
174 | 		}
175 | 	}
176 | }
177 | 
// example of edge color difference calculation from original implementation
// idea is to fit the edge color difference of 2 colors into a single number in 0-255 range
// 2 of the component differences are added and scaled by 1/4 while the 3rd component
// difference is scaled by 1/2, this means 1 of the components is more dominant

//int getDiffFactor(const unsigned char* color1, const unsigned char* color2)
//{
//	int c1 = abs(color1[0] - color2[0]);
//	int c2 = abs(color1[1] - color2[1]);
//	int c3 = abs(color1[2] - color2[2]);
//
//	return ((c1 + c3) >> 2) + (c2 >> 1);
//}


// Calculates the edge color difference for 4 pixel pairs at once.
//   pix4   - 4 packed pixels, 4 bytes each
//   pix4p  - the 4 pixels to compare against, same layout
//   diff4x - receives 4 32-bit values, one difference per pixel pair;
//            written with an aligned store, so it must be 16 byte aligned
// With EDGE_COLOR_USE_MAXIMUM each value is the largest of the first 3 byte differences,
// with EDGE_COLOR_USE_ADDITION it is their sum saturated at 255.
// The 4th byte (alpha) does not take part.
inline void getDiffFactor3x(__m128i pix4, __m128i pix4p, __m128i* diff4x)
{
	// plain local instead of a function level static: no initialization guard on every
	// call and nothing shared between the worker threads calling this
	const __m128i byte_mask = _mm_set1_epi32(255);

	// get absolute difference for each component per pixel
	__m128i diff = _mm_sub_epi8(_mm_max_epu8(pix4, pix4p), _mm_min_epu8(pix4, pix4p));

#ifdef EDGE_COLOR_USE_MAXIMUM
	// get maximum of 3 components
	__m128i diff_shift1 = _mm_srli_epi32(diff, 8); // 2nd component
	diff = _mm_max_epu8(diff, diff_shift1);
	diff_shift1 = _mm_srli_epi32(diff_shift1, 8); // 3rd component
	diff = _mm_max_epu8(diff, diff_shift1);
	// skip alpha component
	diff = _mm_and_si128(diff, byte_mask); // zero out all but 1st byte
#endif

#ifdef EDGE_COLOR_USE_ADDITION
	// add all component differences and saturate 
	__m128i diff_shift1 = _mm_srli_epi32(diff, 8); // 2nd component
	diff = _mm_adds_epu8(diff, diff_shift1);
	diff_shift1 = _mm_srli_epi32(diff_shift1, 8); // 3rd component
	diff = _mm_adds_epu8(diff, diff_shift1);
	diff = _mm_and_si128(diff, byte_mask); // zero out all but 1st byte
#endif

	(void)byte_mask; // unused only if neither edge color mode is defined
	_mm_store_si128(diff4x, diff);
}
221 | 
222 | 
223 | void CRBFilterSSE2::horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch)
224 | {
225 | 	int height_segment = height / m_thread_count;
226 | 	int buffer_offset = thread_index * height_segment * pitch;
227 | 	img_src += buffer_offset;
228 | 	img_dst += buffer_offset;
229 | 
230 | 	if (thread_index + 1 == m_thread_count) // last segment should account for uneven height
231 | 		height_segment += height % m_thread_count;
232 | 
233 | 	float* line_cache = m_h_line_cache[thread_index];
234 | 	const float* range_table = m_range_table;
235 | 
236 | 	__m128 inv_alpha = _mm_set_ps1(m_inv_alpha_f);
237 | 	__m128 half_value = _mm_set_ps1(0.5f);
238 | 	__m128i mask_pack = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
239 | 	__m128i mask_unpack = _mm_setr_epi8(12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1);
240 | 
241 | 	// used to store maximum difference between 2 pixels
242 | 	__declspec(align(16)) long color_diff[4];
243 | 
244 | 	for (int y = 0; y < height_segment; y++)
245 | 	{
246 | 		//////////////////////
247 | 		// right to left pass, results of this pass get stored in 'line_cache'
248 | 		{
249 | 			int pixels_left = width - 1;
250 | 
251 | 			// get end of line buffer
252 | 			float* line_buffer = line_cache + pixels_left * 12;
253 | 
254 | 			///////
255 | 			// handle last pixel in row separately as special case
256 | 			{
257 | 				const unsigned char* last_src = img_src + (y + 1) * pitch - 4;
258 | 
259 | 				// result color
260 | 				line_buffer[8] = (float)last_src[0];
261 | 				line_buffer[9] = (float)last_src[1];
262 | 				line_buffer[10] = (float)last_src[2];
263 | 				line_buffer[11] = (float)last_src[3];
264 | 
265 | 				// premultiplied source
266 | 				// caching pre-multiplied allows saving 1 multiply operation in 2nd pass loop, not a big difference
267 | 				line_buffer[4] = m_inv_alpha_f * line_buffer[8];
268 | 				line_buffer[5] = m_inv_alpha_f * line_buffer[9];
269 | 				line_buffer[6] = m_inv_alpha_f * line_buffer[10];
270 | 				line_buffer[7] = m_inv_alpha_f * line_buffer[11];
271 | 			}
272 | 
273 | 			// "previous" pixel color
274 | 			__m128 pixel_prev = _mm_load_ps(line_buffer + 8);
275 | 			// "previous" pixel factor
276 | 			__m128 alpha_f_prev4 = _mm_set_ps1(1.f);
277 | 
278 | 			///////
279 | 			// handle most middle pixels in 16 byte intervals using xmm registers
280 | 			// process 4x pixels at a time
281 | 			int buffer_inc = y * pitch + (pixels_left - 1) * 4 - 16;
282 | 			const __m128i* src_4xCur = (const __m128i*)(img_src + buffer_inc);
283 | 			const __m128i* src_4xPrev = (const __m128i*)(img_src + buffer_inc + 4);
284 | 
285 | 			while (pixels_left > 0) // outer loop 4x pixel
286 | 			{
287 | 				// load 4x pixel, may read backward past start of buffer, but it's OK since that extra data won't be used
288 | 				__m128i pix4 = _mm_loadu_si128(src_4xCur--);
289 | 				__m128i pix4p = _mm_loadu_si128(src_4xPrev--);
290 | 
291 | 				// get color differences
292 | 				getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff);
293 | 
294 | 				for (int i = 3; i >= 0 && pixels_left-- > 0; i--) // inner loop
295 | 				{
296 | 					float alpha_f = range_table[color_diff[i]];
297 | 					__m128 alpha_f_4x = _mm_set_ps1(alpha_f);
298 | 
299 | 					// cache weights for next filter pass
300 | 					line_buffer -= 12;
301 | 					_mm_store_ps(line_buffer, alpha_f_4x);
302 | 
303 | 					// color factor
304 | 					alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x);
305 | 					alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha);
306 | 
307 | 					// unpack current source pixel
308 | 					__m128i pix1 = _mm_shuffle_epi8(pix4, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
309 | 					pix4 = _mm_slli_si128(pix4, 4); // shift left so next loop unpacks next pixel data 
310 | 					__m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats
311 | 					
312 | 
313 | 					// apply color filter
314 | 					pixel_F = _mm_mul_ps(pixel_F, inv_alpha);
315 | 					_mm_store_ps(line_buffer + 4, pixel_F); // cache pre-multiplied source color for next filter pass
316 | 					alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x);
317 | 					pixel_F = _mm_add_ps(pixel_F, alpha_f_4x);
318 | 
319 | 					// store current color as previous for next cycle
320 | 					pixel_prev = pixel_F;
321 | 
322 | 					// calculate final color
323 | 					pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4);
324 | 
325 | 					// cache filtered color for next filter pass
326 | 					_mm_store_ps(line_buffer + 8, pixel_F);
327 | 				}
328 | 			}
329 | 		}
330 | 
331 | 		//////////////////////
332 | 		// left to right pass
333 | 		{
334 | 			int pixels_left = width - 1;
335 | 
336 | 			// process 4x pixels at a time
337 | 			int buffer_inc = y * pitch;
338 | 			const __m128i* src_4xCur = (const __m128i*)(img_src + buffer_inc + 4); // shifted by 1 pixel
339 | 			const __m128i* src_4xPrev = (const __m128i*)(img_src + buffer_inc);
340 | 
341 | 			// use float type only to enable 4 byte write using MOVSS
342 | 			float* out_result = (float*)(img_dst + buffer_inc + 4); // start at 2nd pixel from left
343 | 
344 | 			const float* line_buffer = line_cache;
345 | 
346 | 			///////
347 | 			// handle first pixel in row separately as special case
348 | 			{
349 | 				unsigned char* first_dst = img_dst + buffer_inc;
350 | 				// average new pixel with one already in output
351 | 				// source color was pre-multipled, so get original
352 | 				float inv_factor = 1.f / m_inv_alpha_f;
353 | 				first_dst[0] = (unsigned char)((line_buffer[4] * inv_factor + line_buffer[8]) * 0.5f);
354 | 				first_dst[1] = (unsigned char)((line_buffer[5] * inv_factor + line_buffer[9]) * 0.5f);
355 | 				first_dst[2] = (unsigned char)((line_buffer[6] * inv_factor + line_buffer[10]) * 0.5f);
356 | 				first_dst[3] = (unsigned char)((line_buffer[7] * inv_factor + line_buffer[11]) * 0.5f);
357 | 			}
358 | 
359 | 			// initialize "previous pixel" with 4 components of last row pixel
360 | 			__m128 pixel_prev = _mm_load_ps(line_buffer + 8);
361 | 			line_buffer += 12;
362 | 			__m128 alpha_f_prev4 = _mm_set_ps1(1.f);
363 | 
364 | 
365 | 			///////
366 | 			// handle most pixels in 16 byte intervals using xmm registers
367 | 			while (pixels_left > 0) // outer loop 4x pixel
368 | 			{
369 | 				for (int i = 0; i <= 3 && pixels_left-- > 0; i++) // inner loop
370 | 				{
371 | 					// load cached factor
372 | 					__m128 alpha_f_4x = _mm_load_ps(line_buffer);
373 | 					line_buffer += 12;
374 | 
375 | 					// color factor
376 | 					alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x);
377 | 					alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha);
378 | 
379 | 					// load current source pixel, pre-multiplied
380 | 					__m128 pixel_F = _mm_load_ps(line_buffer + 4);
381 | 
382 | 
383 | 					// apply color filter
384 | 					alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x);
385 | 					pixel_F = _mm_add_ps(pixel_F, alpha_f_4x);
386 | 
387 | 					// store current color as previous for next cycle
388 | 					pixel_prev = pixel_F;
389 | 
390 | 					// calculate final color
391 | 					pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4);
392 | 
393 | 					// average this result with result from previous pass
394 | 					__m128 prev_pix4 = _mm_load_ps(line_buffer + 8);
395 | 
396 | 					pixel_F = _mm_add_ps(pixel_F, prev_pix4);
397 | 					pixel_F = _mm_mul_ps(pixel_F, half_value);
398 | 
399 | 					// pack float pixel into byte pixel
400 | 					__m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer
401 | 					pixB = _mm_shuffle_epi8(pixB, mask_pack);
402 | 					_mm_store_ss(out_result++, _mm_castsi128_ps(pixB));
403 | 
404 | 				}
405 | 			}
406 | 		}
407 | 	}
408 | }
409 | 
410 | 
411 | void CRBFilterSSE2::verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch)
412 | {
413 | 	int width_segment = width / m_thread_count;
414 | 	// make sure width segments round to 16 byte boundary except for last one
415 | 	width_segment -= width_segment % 4;
416 | 	int start_offset = width_segment * thread_index;
417 | 	if (thread_index == m_thread_count - 1) // last one
418 | 		width_segment = width - start_offset;
419 | 
420 | 	int width4 = width_segment / 4;
421 | 
422 | 	// adjust img buffer starting positions
423 | 	img_src += start_offset * 4;
424 | 	img_dst += start_offset * 4;
425 | 
426 | 	float* line_cache = m_v_line_cache[thread_index];
427 | 	const float* range_table = m_range_table;
428 | 
429 | 	__m128 inv_alpha = _mm_set_ps1(m_inv_alpha_f);
430 | 	__m128 half_value = _mm_set_ps1(0.5f);
431 | 	__m128i mask_pack = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
432 | 	__m128i mask_unpack = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1);
433 | 
434 | 	// used to store maximum difference between 2 pixels
435 | 	__declspec(align(16)) long color_diff[4];
436 | 
437 | 	/////////////////
438 | 	// Bottom to top pass first
439 | 	{
440 | 		// last line processed separately since no previous
441 | 		{
442 | 			unsigned char* dst_line = img_dst + (height - 1) * pitch;
443 | 			const unsigned char* src_line = img_src + (height - 1) * pitch;
444 | 			float* line_buffer = line_cache;
445 | 
446 | 			memcpy(dst_line, src_line, width_segment * 4); // copy last line
447 | 
448 | 			// initialize line cache
449 | 			for (int x = 0; x < width_segment; x++)
450 | 			{
451 | 				// set factor to 1
452 | 				line_buffer[0] = 1.f;
453 | 				line_buffer[1] = 1.f;
454 | 				line_buffer[2] = 1.f;
455 | 				line_buffer[3] = 1.f;
456 | 
457 | 				// set result color
458 | 				line_buffer[4] = (float)src_line[0];
459 | 				line_buffer[5] = (float)src_line[1];
460 | 				line_buffer[6] = (float)src_line[2];
461 | 				line_buffer[7] = (float)src_line[3];
462 | 
463 | 				src_line += 4;
464 | 				line_buffer += 8;
465 | 			}
466 | 		}
467 | 
468 | 		// process other lines
469 | 		for (int y = height - 2; y >= 0; y--)
470 | 		{
471 | 			float* dst_line = (float*)(img_dst + y * pitch);
472 | 			float* line_buffer = line_cache;
473 | 
474 | 			__m128i* src_4xCur = (__m128i*)(img_src + y * pitch);
475 | 			__m128i* src_4xPrev = (__m128i*)(img_src + (y + 1) * pitch);
476 | 
477 | 			int pixels_left = width_segment;
478 | 			while (pixels_left > 0)
479 | 			{
480 | 				// may read past end of buffer, but that data won't be used
481 | 				__m128i pix4 = _mm_loadu_si128(src_4xCur++); // load 4x pixel
482 | 				__m128i pix4p = _mm_loadu_si128(src_4xPrev++);
483 | 
484 | 				// get color differences
485 | 				getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff);
486 | 
487 | 				for (int i = 0; i < 4 && pixels_left-- > 0; i++) // inner loop
488 | 				{
489 | 					float alpha_f = range_table[color_diff[i]];
490 | 					__m128 alpha_f_4x = _mm_set_ps1(alpha_f);
491 | 
492 | 					// load previous line color factor
493 | 					__m128 alpha_f_prev4 = _mm_load_ps(line_buffer);
494 | 					// load previous line color
495 | 					__m128 pixel_prev = _mm_load_ps(line_buffer + 4);
496 | 
497 | 					// color factor
498 | 					alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x);
499 | 					alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha);
500 | 
501 | 					// unpack current source pixel
502 | 					__m128i pix1 = _mm_shuffle_epi8(pix4, mask_unpack);
503 | 					pix4 = _mm_srli_si128(pix4, 4); // shift right
504 | 					__m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats
505 | 					
506 | 
507 | 					// apply color filter
508 | 					pixel_F = _mm_mul_ps(pixel_F, inv_alpha);
509 | 					alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x);
510 | 					pixel_F = _mm_add_ps(pixel_F, alpha_f_4x);
511 | 
512 | 					// store current factor and color as previous for next cycle
513 | 					_mm_store_ps(line_buffer, alpha_f_prev4);
514 | 					_mm_store_ps(line_buffer + 4, pixel_F);
515 | 					line_buffer += 8;
516 | 
517 | 					// calculate final color
518 | 					pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4);
519 | 
520 | 					// pack float pixel into byte pixel
521 | 					__m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer
522 | 					pixB = _mm_shuffle_epi8(pixB, mask_pack);
523 | 					_mm_store_ss(dst_line++, _mm_castsi128_ps(pixB));
524 | 				}
525 | 			}
526 | 		}
527 | 	}
528 | 
529 | 	/////////////////
530 | 	// Top to bottom pass last
531 | 	{
532 | 		mask_pack = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12);
533 | 
534 | 		// first line handled separately since no previous
535 | 		{
536 | 			unsigned char* dst_line = img_dst;
537 | 			const unsigned char* src_line = img_src;
538 | 			float* line_buffer = line_cache;
539 | 
540 | 			for (int x = 0; x < width_segment; x++)
541 | 			{
542 | 				// average ccurrent destanation color with current source
543 | 				dst_line[0] = (dst_line[0] + src_line[0]) / 2;
544 | 				dst_line[1] = (dst_line[1] + src_line[1]) / 2;
545 | 				dst_line[2] = (dst_line[2] + src_line[2]) / 2;
546 | 				dst_line[3] = (dst_line[3] + src_line[3]) / 2;
547 | 
548 | 				// set factor to 1
549 | 				line_buffer[0] = 1.f;
550 | 				line_buffer[1] = 1.f;
551 | 				line_buffer[2] = 1.f;
552 | 				line_buffer[3] = 1.f;
553 | 
554 | 				// set result color
555 | 				line_buffer[4] = (float)src_line[0];
556 | 				line_buffer[5] = (float)src_line[1];
557 | 				line_buffer[6] = (float)src_line[2];
558 | 				line_buffer[7] = (float)src_line[3];
559 | 
560 | 				dst_line += 4;
561 | 				src_line += 4;
562 | 				line_buffer += 8;
563 | 			}
564 | 		}
565 | 
566 | 		// process other lines
567 | 		for (int y = 1; y < height; y++)
568 | 		{
569 | 			//	const unsigned char* src_line = img_src + y * pitch;
570 | 			float* line_buffer = line_cache;
571 | 
572 | 			__m128i* src_4xCur = (__m128i*)(img_src + y * pitch);
573 | 			__m128i* src_4xPrev = (__m128i*)(img_src + (y - 1) * pitch);
574 | 			__m128i* dst_4x = (__m128i*)(img_dst + y * pitch);
575 | 
576 | 			for (int x = 0; x < width4; x++)
577 | 			{
578 | 				// get color difference
579 | 				__m128i pix4 = _mm_loadu_si128(src_4xCur++); // load 4x pixel
580 | 				__m128i pix4p = _mm_loadu_si128(src_4xPrev++);
581 | 
582 | 				// get color differences
583 | 				getDiffFactor3x(pix4, pix4p, (__m128i*)color_diff);
584 | 
585 | 				__m128i out_pix4;
586 | 				for (int i = 0; i < 4; i++) // inner loop
587 | 				{
588 | 					float alpha_f = range_table[color_diff[i]];
589 | 					__m128 alpha_f_4x = _mm_set_ps1(alpha_f);
590 | 
591 | 					// load previous line color factor
592 | 					__m128 alpha_f_prev4 = _mm_load_ps(line_buffer);
593 | 					// load previous line color
594 | 					__m128 pixel_prev = _mm_load_ps(line_buffer + 4);
595 | 
596 | 					// color factor
597 | 					//	alpha_f_prev = m_inv_alpha_f + alpha_f * alpha_f_prev;
598 | 					alpha_f_prev4 = _mm_mul_ps(alpha_f_prev4, alpha_f_4x);
599 | 					alpha_f_prev4 = _mm_add_ps(alpha_f_prev4, inv_alpha);
600 | 
601 | 					// unpack current source pixel
602 | 					__m128i pix1 = _mm_shuffle_epi8(pix4, mask_unpack);
603 | 					pix4 = _mm_srli_si128(pix4, 4); // shift right
604 | 					__m128 pixel_F = _mm_cvtepi32_ps(pix1); // convert to floats
605 | 
606 | 					// apply color filter
607 | 					pixel_F = _mm_mul_ps(pixel_F, inv_alpha);
608 | 					alpha_f_4x = _mm_mul_ps(pixel_prev, alpha_f_4x);
609 | 					pixel_F = _mm_add_ps(pixel_F, alpha_f_4x);
610 | 
611 | 					// store current factor and color as previous for next cycle
612 | 					_mm_store_ps(line_buffer, alpha_f_prev4);
613 | 					_mm_store_ps(line_buffer + 4, pixel_F);
614 | 					line_buffer += 8;
615 | 
616 | 					// calculate final color
617 | 					pixel_F = _mm_div_ps(pixel_F, alpha_f_prev4);
618 | 
619 | 					// pack float pixel into byte pixel
620 | 					__m128i pixB = _mm_cvtps_epi32(pixel_F); // convert to integer
621 | 					pixB = _mm_shuffle_epi8(pixB, mask_pack);
622 | 
623 | 					out_pix4 = _mm_srli_si128(out_pix4, 4); // shift 
624 | 					out_pix4 = _mm_or_si128(out_pix4, pixB);
625 | 
626 | 				}
627 | 
628 | 				// average result 4x pixel with what is already in destination
629 | 				__m128i dst4 = _mm_loadu_si128(dst_4x);
630 | 				out_pix4 = _mm_avg_epu8(out_pix4, dst4);
631 | 				_mm_storeu_si128(dst_4x++, out_pix4); // store 4x pixel
632 | 			}
633 | 
634 | 			// have to handle leftover 1-3 pixels if last width segment isn't divisble by 4
635 | 			if (width_segment % 4)
636 | 			{
637 | 				// this should be avoided by having image buffers with pitch divisible by 16
638 | 			}
639 | 		}
640 | 	}
641 | 
642 | }
643 | 
644 | bool CRBFilterSSE2::filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch)
645 | {
646 | 	// basic error checking
647 | 	if (!m_stage_buffer[0])
648 | 		return false;
649 | 
650 | 	if (width < 16 || width > m_reserved_width)
651 | 		return false;
652 | 
653 | 	if (height < 16 || height > m_reserved_height)
654 | 		return false;
655 | 
656 | 	if (pitch < width * 4)
657 | 		return false;
658 | 
659 | 	if (!out_data || !in_data)
660 | 		return false;
661 | 
662 | 	if (m_inv_alpha_f == 0.f)
663 | 		return false;
664 | 
665 | 	int thread_count_adjusted = m_thread_count - 1;
666 | 
667 | 	////////////////////////////////////////////// 
668 | 	// horizontal filter divided in threads
669 | 	for (int i = 0; i < thread_count_adjusted; i++)
670 | 	{
671 | 		m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::horizontalFilter, this, i, in_data, m_stage_buffer[0], width, height, pitch);
672 | 	}
673 | 
674 | 	// use this thread for last segment
675 | 	horizontalFilter(thread_count_adjusted, in_data, m_stage_buffer[0], width, height, pitch);
676 | 
677 | 	// wait for result
678 | 	for (int i = 0; i < thread_count_adjusted; i++)
679 | 	{
680 | 		m_horizontal_tasks[i].get();
681 | 	}
682 | 
683 | 	/////////////////////////////////////////////
684 | 	// vertical filter divided in threads
685 | 	for (int i = 0; i < thread_count_adjusted; i++)
686 | 	{
687 | 		m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::verticalFilter, this, i, m_stage_buffer[0], out_data, width, height, pitch);
688 | 	}
689 | 
690 | 	// use this thread for last segment
691 | 	verticalFilter(thread_count_adjusted, m_stage_buffer[0], out_data, width, height, pitch);
692 | 
693 | 	// wait for result
694 | 	for (int i = 0; i < thread_count_adjusted; i++)
695 | 	{
696 | 		m_vertical_tasks[i].get();
697 | 	}
698 | 
699 | 	return true;
700 | }
701 | 
702 | bool CRBFilterSSE2::filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch)
703 | {
704 | 	// basic error checking
705 | 	if (!m_stage_buffer[0])
706 | 		return false;
707 | 
708 | 	if (width < 16 || width > m_reserved_width)
709 | 		return false;
710 | 
711 | 	if (height < 16 || height > m_reserved_height)
712 | 		return false;
713 | 
714 | 	if (pitch < width * 4)
715 | 		return false;
716 | 
717 | 	if (m_inv_alpha_f == 0.f)
718 | 		return false;
719 | 
720 | 	m_image_width = width;
721 | 	m_image_height = height;
722 | 	m_image_pitch = pitch;
723 | 
724 | 	// block until last frame finished 1st stage
725 | 	for (int i = 0; i < m_thread_count; i++)
726 | 	{
727 | 		if (m_horizontal_tasks[i].valid())
728 | 			m_horizontal_tasks[i].get();
729 | 	}
730 | 
731 | 	int previous_stage_index = (m_filter_counter - 1) % STAGE_BUFFER_COUNT;
732 | 	int current_stage_index = m_filter_counter % STAGE_BUFFER_COUNT;
733 | 	m_filter_counter++;
734 | 	m_out_buffer[current_stage_index] = out_data;
735 | 
736 | 	// start new horizontal stage
737 | 	if (in_data)
738 | 	{
739 | 		// start first stage for current frame
740 | 		for (int i = 0; i < m_thread_count; i++)
741 | 		{
742 | 			m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::horizontalFilter, this, i, in_data, m_stage_buffer[current_stage_index], width, height, pitch);
743 | 		}
744 | 	}
745 | 
746 | 	// block until last frame finished 2nd stage
747 | 	for (int i = 0; i < m_thread_count; i++)
748 | 	{
749 | 		if (m_vertical_tasks[i].valid())
750 | 			m_vertical_tasks[i].get();
751 | 	}
752 | 
753 | 	// start new vertical stage based on result of previous stage
754 | 	if (previous_stage_index >= 0 && m_out_buffer[previous_stage_index])
755 | 	{
756 | 		// start first stage for current frame
757 | 		for (int i = 0; i < m_thread_count; i++)
758 | 		{
759 | 			m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterSSE2::verticalFilter, this, i, m_stage_buffer[previous_stage_index], m_out_buffer[previous_stage_index], width, height, pitch);
760 | 		}
761 | 	}
762 | 
763 | 	return true;
764 | }
765 | 
766 | void CRBFilterSSE2::filterPipeFlush()
767 | {
768 | 	filterPipePush(nullptr, nullptr, m_image_width, m_image_height, m_image_pitch);
769 | 
770 | 	if (m_filter_counter > 0)
771 | 	{
772 | 		for (int i = 0; i < m_thread_count; i++)
773 | 		{
774 | 			if(m_vertical_tasks[i].valid())
775 | 				m_vertical_tasks[i].get();
776 | 		}
777 | 	}
778 | 
779 | 	m_filter_counter = 0;
780 | }


--------------------------------------------------------------------------------
/stb_image_write.h:
--------------------------------------------------------------------------------
   1 | /* stb_image_write - v1.05 - public domain - http://nothings.org/stb/stb_image_write.h
   2 |    writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010-2015
   3 |                                      no warranty implied; use at your own risk
   4 | 
   5 |    Before #including,
   6 | 
   7 |        #define STB_IMAGE_WRITE_IMPLEMENTATION
   8 | 
   9 |    in the file that you want to have the implementation.
  10 | 
  11 |    Will probably not work correctly with strict-aliasing optimizations.
  12 | 
  13 | ABOUT:
  14 | 
  15 |    This header file is a library for writing images to C stdio. It could be
  16 |    adapted to write to memory or a general streaming interface; let me know.
  17 | 
  18 |    The PNG output is not optimal; it is 20-50% larger than the file
  19 |    written by a decent optimizing implementation. This library is designed
  20 |    for source code compactness and simplicity, not optimal image file size
  21 |    or run-time performance.
  22 | 
  23 | BUILDING:
  24 | 
  25 |    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
  26 |    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
  27 |    malloc,realloc,free.
  28 |    You can define STBIW_MEMMOVE() to replace memmove()
  29 | 
  30 | USAGE:
  31 | 
  32 |    There are four functions, one for each image file format:
  33 | 
  34 |      int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
  35 |      int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
  36 |      int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
  37 |      int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
  38 | 
  39 |    There are also four equivalent functions that use an arbitrary write function. You are
  40 |    expected to open/close your file-equivalent before and after calling these:
  41 | 
  42 |      int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
  43 |      int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
  44 |      int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
  45 |      int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
  46 | 
  47 |    where the callback is:
  48 |       void stbi_write_func(void *context, void *data, int size);
  49 | 
  50 |    You can define STBI_WRITE_NO_STDIO to disable the file variant of these
  51 |    functions, so the library will not use stdio.h at all. However, this will
  52 |    also disable HDR writing, because it requires stdio for formatted output.
  53 | 
  54 |    Each function returns 0 on failure and non-0 on success.
  55 | 
  56 |    The functions create an image file defined by the parameters. The image
  57 |    is a rectangle of pixels stored from left-to-right, top-to-bottom.
  58 |    Each pixel contains 'comp' channels of data stored interleaved with 8-bits
  59 |    per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
  60 |    monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
  61 |    The *data pointer points to the first byte of the top-left-most pixel.
  62 |    For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
  63 |    a row of pixels to the first byte of the next row of pixels.
  64 | 
  65 |    PNG creates output files with the same number of components as the input.
  66 |    The BMP format expands Y to RGB in the file format and does not
  67 |    output alpha.
  68 | 
  69 |    PNG supports writing rectangles of data even when the bytes storing rows of
  70 |    data are not consecutive in memory (e.g. sub-rectangles of a larger image),
  71 |    by supplying the stride between the beginning of adjacent rows. The other
  72 |    formats do not. (Thus you cannot write a native-format BMP through the BMP
  73 |    writer, both because it is in BGR order and because it may have padding
  74 |    at the end of the line.)
  75 | 
  76 |    HDR expects linear float data. Since the format is always 32-bit rgb(e)
  77 |    data, alpha (if provided) is discarded, and for monochrome data it is
  78 |    replicated across all three channels.
  79 | 
  80 |    TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
  81 |    data, set the global variable 'stbi_write_tga_with_rle' to 0.
  82 | 
  83 | CREDITS:
  84 | 
  85 |    PNG/BMP/TGA
  86 |       Sean Barrett
  87 |    HDR
  88 |       Baldur Karlsson
  89 |    TGA monochrome:
  90 |       Jean-Sebastien Guay
  91 |    misc enhancements:
  92 |       Tim Kelsey
  93 |    TGA RLE
  94 |       Alan Hickman
  95 |    initial file IO callback implementation
  96 |       Emmanuel Julien
  97 |    bugfixes:
  98 |       github:Chribba
  99 |       Guillaume Chereau
 100 |       github:jry2
 101 |       github:romigrou
 102 |       Sergio Gonzalez
 103 |       Jonas Karlsson
 104 |       Filip Wasil
 105 |       Thatcher Ulrich
 106 |       github:poppolopoppo
 107 |       Patrick Boettcher
 108 |       
 109 | LICENSE
 110 | 
 111 |   See end of file for license information.
 112 | 
 113 | */
 114 | 
 115 | #ifndef INCLUDE_STB_IMAGE_WRITE_H
 116 | #define INCLUDE_STB_IMAGE_WRITE_H
 117 | 
 118 | #ifdef __cplusplus
 119 | extern "C" {
 120 | #endif
 121 | 
 122 | #ifdef STB_IMAGE_WRITE_STATIC
 123 | #define STBIWDEF static
 124 | #else
 125 | #define STBIWDEF extern
 126 | extern int stbi_write_tga_with_rle;
 127 | #endif
 128 | 
 129 | #ifndef STBI_WRITE_NO_STDIO
 130 | STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
 131 | STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
 132 | STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
 133 | STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 134 | #endif
 135 | 
 136 | typedef void stbi_write_func(void *context, void *data, int size);
 137 | 
 138 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
 139 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 140 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 141 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
 142 | 
 143 | #ifdef __cplusplus
 144 | }
 145 | #endif
 146 | 
 147 | #endif//INCLUDE_STB_IMAGE_WRITE_H
 148 | 
 149 | #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 150 | 
 151 | #ifdef _WIN32
 152 |    #ifndef _CRT_SECURE_NO_WARNINGS
 153 |    #define _CRT_SECURE_NO_WARNINGS
 154 |    #endif
 155 |    #ifndef _CRT_NONSTDC_NO_DEPRECATE
 156 |    #define _CRT_NONSTDC_NO_DEPRECATE
 157 |    #endif
 158 | #endif
 159 | 
 160 | #ifndef STBI_WRITE_NO_STDIO
 161 | #include <stdio.h>
 162 | #endif // STBI_WRITE_NO_STDIO
 163 | 
 164 | #include <stdarg.h>
 165 | #include <stdlib.h>
 166 | #include <string.h>
 167 | #include <math.h>
 168 | 
 169 | #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 170 | // ok
 171 | #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 172 | // ok
 173 | #else
 174 | #error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 175 | #endif
 176 | 
 177 | #ifndef STBIW_MALLOC
 178 | #define STBIW_MALLOC(sz)        malloc(sz)
 179 | #define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
 180 | #define STBIW_FREE(p)           free(p)
 181 | #endif
 182 | 
 183 | #ifndef STBIW_REALLOC_SIZED
 184 | #define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
 185 | #endif
 186 | 
 187 | 
 188 | #ifndef STBIW_MEMMOVE
 189 | #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
 190 | #endif
 191 | 
 192 | 
 193 | #ifndef STBIW_ASSERT
 194 | #include <assert.h>
 195 | #define STBIW_ASSERT(x) assert(x)
 196 | #endif
 197 | 
 198 | #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
 199 | 
// Output sink used by every writer in this file: all bytes of an image are emitted
// by calling func(context, data, size).
typedef struct
{
   stbi_write_func *func;   // callback that receives each chunk of output
   void *context;           // opaque pointer passed back to func as its first argument
} stbi__write_context;
 205 | 
 206 | // initialize a callback-based context
 207 | static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
 208 | {
 209 |    s->func    = c;
 210 |    s->context = context;
 211 | }
 212 | 
 213 | #ifndef STBI_WRITE_NO_STDIO
 214 | 
 215 | static void stbi__stdio_write(void *context, void *data, int size)
 216 | {
 217 |    fwrite(data,1,size,(FILE*) context);
 218 | }
 219 | 
 220 | static int stbi__start_write_file(stbi__write_context *s, const char *filename)
 221 | {
 222 |    FILE *f = fopen(filename, "wb");
 223 |    stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
 224 |    return f != NULL;
 225 | }
 226 | 
 227 | static void stbi__end_write_file(stbi__write_context *s)
 228 | {
 229 |    fclose((FILE *)s->context);
 230 | }
 231 | 
 232 | #endif // !STBI_WRITE_NO_STDIO
 233 | 
 234 | typedef unsigned int stbiw_uint32;
 235 | typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
 236 | 
 237 | #ifdef STB_IMAGE_WRITE_STATIC
 238 | static int stbi_write_tga_with_rle = 1;
 239 | #else
 240 | int stbi_write_tga_with_rle = 1;
 241 | #endif
 242 | 
 243 | static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 244 | {
 245 |    while (*fmt) {
 246 |       switch (*fmt++) {
 247 |          case ' ': break;
 248 |          case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
 249 |                      s->func(s->context,&x,1);
 250 |                      break; }
 251 |          case '2': { int x = va_arg(v,int);
 252 |                      unsigned char b[2];
 253 |                      b[0] = STBIW_UCHAR(x);
 254 |                      b[1] = STBIW_UCHAR(x>>8);
 255 |                      s->func(s->context,b,2);
 256 |                      break; }
 257 |          case '4': { stbiw_uint32 x = va_arg(v,int);
 258 |                      unsigned char b[4];
 259 |                      b[0]=STBIW_UCHAR(x);
 260 |                      b[1]=STBIW_UCHAR(x>>8);
 261 |                      b[2]=STBIW_UCHAR(x>>16);
 262 |                      b[3]=STBIW_UCHAR(x>>24);
 263 |                      s->func(s->context,b,4);
 264 |                      break; }
 265 |          default:
 266 |             STBIW_ASSERT(0);
 267 |             return;
 268 |       }
 269 |    }
 270 | }
 271 | 
 272 | static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
 273 | {
 274 |    va_list v;
 275 |    va_start(v, fmt);
 276 |    stbiw__writefv(s, fmt, v);
 277 |    va_end(v);
 278 | }
 279 | 
 280 | static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 281 | {
 282 |    unsigned char arr[3];
 283 |    arr[0] = a, arr[1] = b, arr[2] = c;
 284 |    s->func(s->context, arr, 3);
 285 | }
 286 | 
 287 | static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
 288 | {
 289 |    unsigned char bg[3] = { 255, 0, 255}, px[3];
 290 |    int k;
 291 | 
 292 |    if (write_alpha < 0)
 293 |       s->func(s->context, &d[comp - 1], 1);
 294 | 
 295 |    switch (comp) {
 296 |       case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
 297 |       case 1:
 298 |          if (expand_mono)
 299 |             stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
 300 |          else
 301 |             s->func(s->context, d, 1);  // monochrome TGA
 302 |          break;
 303 |       case 4:
 304 |          if (!write_alpha) {
 305 |             // composite against pink background
 306 |             for (k = 0; k < 3; ++k)
 307 |                px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
 308 |             stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
 309 |             break;
 310 |          }
 311 |          /* FALLTHROUGH */
 312 |       case 3:
 313 |          stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
 314 |          break;
 315 |    }
 316 |    if (write_alpha > 0)
 317 |       s->func(s->context, &d[comp - 1], 1);
 318 | }
 319 | 
 320 | static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
 321 | {
 322 |    stbiw_uint32 zero = 0;
 323 |    int i,j, j_end;
 324 | 
 325 |    if (y <= 0)
 326 |       return;
 327 | 
 328 |    if (vdir < 0)
 329 |       j_end = -1, j = y-1;
 330 |    else
 331 |       j_end =  y, j = 0;
 332 | 
 333 |    for (; j != j_end; j += vdir) {
 334 |       for (i=0; i < x; ++i) {
 335 |          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
 336 |          stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
 337 |       }
 338 |       s->func(s->context, &zero, scanline_pad);
 339 |    }
 340 | }
 341 | 
 342 | static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
 343 | {
 344 |    if (y < 0 || x < 0) {
 345 |       return 0;
 346 |    } else {
 347 |       va_list v;
 348 |       va_start(v, fmt);
 349 |       stbiw__writefv(s, fmt, v);
 350 |       va_end(v);
 351 |       stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
 352 |       return 1;
 353 |    }
 354 | }
 355 | 
 356 | static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 357 | {
 358 |    int pad = (-x*3) & 3;
 359 |    return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
 360 |            "11 4 22 4" "4 44 22 444444",
 361 |            'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
 362 |             40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
 363 | }
 364 | 
 365 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 366 | {
 367 |    stbi__write_context s;
 368 |    stbi__start_write_callbacks(&s, func, context);
 369 |    return stbi_write_bmp_core(&s, x, y, comp, data);
 370 | }
 371 | 
 372 | #ifndef STBI_WRITE_NO_STDIO
 373 | STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
 374 | {
 375 |    stbi__write_context s;
 376 |    if (stbi__start_write_file(&s,filename)) {
 377 |       int r = stbi_write_bmp_core(&s, x, y, comp, data);
 378 |       stbi__end_write_file(&s);
 379 |       return r;
 380 |    } else
 381 |       return 0;
 382 | }
 383 | #endif //!STBI_WRITE_NO_STDIO
 384 | 
 385 | static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
 386 | {
 387 |    int has_alpha = (comp == 2 || comp == 4);
 388 |    int colorbytes = has_alpha ? comp-1 : comp;
 389 |    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
 390 | 
 391 |    if (y < 0 || x < 0)
 392 |       return 0;
 393 | 
 394 |    if (!stbi_write_tga_with_rle) {
 395 |       return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
 396 |          "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
 397 |    } else {
 398 |       int i,j,k;
 399 | 
 400 |       stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
 401 | 
 402 |       for (j = y - 1; j >= 0; --j) {
 403 |           unsigned char *row = (unsigned char *) data + j * x * comp;
 404 |          int len;
 405 | 
 406 |          for (i = 0; i < x; i += len) {
 407 |             unsigned char *begin = row + i * comp;
 408 |             int diff = 1;
 409 |             len = 1;
 410 | 
 411 |             if (i < x - 1) {
 412 |                ++len;
 413 |                diff = memcmp(begin, row + (i + 1) * comp, comp);
 414 |                if (diff) {
 415 |                   const unsigned char *prev = begin;
 416 |                   for (k = i + 2; k < x && len < 128; ++k) {
 417 |                      if (memcmp(prev, row + k * comp, comp)) {
 418 |                         prev += comp;
 419 |                         ++len;
 420 |                      } else {
 421 |                         --len;
 422 |                         break;
 423 |                      }
 424 |                   }
 425 |                } else {
 426 |                   for (k = i + 2; k < x && len < 128; ++k) {
 427 |                      if (!memcmp(begin, row + k * comp, comp)) {
 428 |                         ++len;
 429 |                      } else {
 430 |                         break;
 431 |                      }
 432 |                   }
 433 |                }
 434 |             }
 435 | 
 436 |             if (diff) {
 437 |                unsigned char header = STBIW_UCHAR(len - 1);
 438 |                s->func(s->context, &header, 1);
 439 |                for (k = 0; k < len; ++k) {
 440 |                   stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
 441 |                }
 442 |             } else {
 443 |                unsigned char header = STBIW_UCHAR(len - 129);
 444 |                s->func(s->context, &header, 1);
 445 |                stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
 446 |             }
 447 |          }
 448 |       }
 449 |    }
 450 |    return 1;
 451 | }
 452 | 
 453 | int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 454 | {
 455 |    stbi__write_context s;
 456 |    stbi__start_write_callbacks(&s, func, context);
 457 |    return stbi_write_tga_core(&s, x, y, comp, (void *) data);
 458 | }
 459 | 
 460 | #ifndef STBI_WRITE_NO_STDIO
 461 | int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
 462 | {
 463 |    stbi__write_context s;
 464 |    if (stbi__start_write_file(&s,filename)) {
 465 |       int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
 466 |       stbi__end_write_file(&s);
 467 |       return r;
 468 |    } else
 469 |       return 0;
 470 | }
 471 | #endif
 472 | 
 473 | // *************************************************************************************************
 474 | // Radiance RGBE HDR writer
 475 | // by Baldur Karlsson
 476 | 
 477 | #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 478 | 
 479 | void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 480 | {
 481 |    int exponent;
 482 |    float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 483 | 
 484 |    if (maxcomp < 1e-32f) {
 485 |       rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
 486 |    } else {
 487 |       float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
 488 | 
 489 |       rgbe[0] = (unsigned char)(linear[0] * normalize);
 490 |       rgbe[1] = (unsigned char)(linear[1] * normalize);
 491 |       rgbe[2] = (unsigned char)(linear[2] * normalize);
 492 |       rgbe[3] = (unsigned char)(exponent + 128);
 493 |    }
 494 | }
 495 | 
 496 | void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
 497 | {
 498 |    unsigned char lengthbyte = STBIW_UCHAR(length+128);
 499 |    STBIW_ASSERT(length+128 <= 255);
 500 |    s->func(s->context, &lengthbyte, 1);
 501 |    s->func(s->context, &databyte, 1);
 502 | }
 503 | 
 504 | void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
 505 | {
 506 |    unsigned char lengthbyte = STBIW_UCHAR(length);
 507 |    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
 508 |    s->func(s->context, &lengthbyte, 1);
 509 |    s->func(s->context, data, length);
 510 | }
 511 | 
 512 | void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 513 | {
 514 |    unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
 515 |    unsigned char rgbe[4];
 516 |    float linear[3];
 517 |    int x;
 518 | 
 519 |    scanlineheader[2] = (width&0xff00)>>8;
 520 |    scanlineheader[3] = (width&0x00ff);
 521 | 
 522 |    /* skip RLE for images too small or large */
 523 |    if (width < 8 || width >= 32768) {
 524 |       for (x=0; x < width; x++) {
 525 |          switch (ncomp) {
 526 |             case 4: /* fallthrough */
 527 |             case 3: linear[2] = scanline[x*ncomp + 2];
 528 |                     linear[1] = scanline[x*ncomp + 1];
 529 |                     linear[0] = scanline[x*ncomp + 0];
 530 |                     break;
 531 |             default:
 532 |                     linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
 533 |                     break;
 534 |          }
 535 |          stbiw__linear_to_rgbe(rgbe, linear);
 536 |          s->func(s->context, rgbe, 4);
 537 |       }
 538 |    } else {
 539 |       int c,r;
 540 |       /* encode into scratch buffer */
 541 |       for (x=0; x < width; x++) {
 542 |          switch(ncomp) {
 543 |             case 4: /* fallthrough */
 544 |             case 3: linear[2] = scanline[x*ncomp + 2];
 545 |                     linear[1] = scanline[x*ncomp + 1];
 546 |                     linear[0] = scanline[x*ncomp + 0];
 547 |                     break;
 548 |             default:
 549 |                     linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
 550 |                     break;
 551 |          }
 552 |          stbiw__linear_to_rgbe(rgbe, linear);
 553 |          scratch[x + width*0] = rgbe[0];
 554 |          scratch[x + width*1] = rgbe[1];
 555 |          scratch[x + width*2] = rgbe[2];
 556 |          scratch[x + width*3] = rgbe[3];
 557 |       }
 558 | 
 559 |       s->func(s->context, scanlineheader, 4);
 560 | 
 561 |       /* RLE each component separately */
 562 |       for (c=0; c < 4; c++) {
 563 |          unsigned char *comp = &scratch[width*c];
 564 | 
 565 |          x = 0;
 566 |          while (x < width) {
 567 |             // find first run
 568 |             r = x;
 569 |             while (r+2 < width) {
 570 |                if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
 571 |                   break;
 572 |                ++r;
 573 |             }
 574 |             if (r+2 >= width)
 575 |                r = width;
 576 |             // dump up to first run
 577 |             while (x < r) {
 578 |                int len = r-x;
 579 |                if (len > 128) len = 128;
 580 |                stbiw__write_dump_data(s, len, &comp[x]);
 581 |                x += len;
 582 |             }
 583 |             // if there's a run, output it
 584 |             if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
 585 |                // find next byte after run
 586 |                while (r < width && comp[r] == comp[x])
 587 |                   ++r;
 588 |                // output run up to r
 589 |                while (x < r) {
 590 |                   int len = r-x;
 591 |                   if (len > 127) len = 127;
 592 |                   stbiw__write_run_data(s, len, comp[x]);
 593 |                   x += len;
 594 |                }
 595 |             }
 596 |          }
 597 |       }
 598 |    }
 599 | }
 600 | 
 601 | static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
 602 | {
 603 |    if (y <= 0 || x <= 0 || data == NULL)
 604 |       return 0;
 605 |    else {
 606 |       // Each component is stored separately. Allocate scratch space for full output scanline.
 607 |       unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
 608 |       int i, len;
 609 |       char buffer[128];
 610 |       char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
 611 |       s->func(s->context, header, sizeof(header)-1);
 612 | 
 613 |       len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 614 |       s->func(s->context, buffer, len);
 615 | 
 616 |       for(i=0; i < y; i++)
 617 |          stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*i*x);
 618 |       STBIW_FREE(scratch);
 619 |       return 1;
 620 |    }
 621 | }
 622 | 
 623 | int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
 624 | {
 625 |    stbi__write_context s;
 626 |    stbi__start_write_callbacks(&s, func, context);
 627 |    return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 628 | }
 629 | 
 630 | #ifndef STBI_WRITE_NO_STDIO
 631 | int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 632 | {
 633 |    stbi__write_context s;
 634 |    if (stbi__start_write_file(&s,filename)) {
 635 |       int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 636 |       stbi__end_write_file(&s);
 637 |       return r;
 638 |    } else
 639 |       return 0;
 640 | }
 641 | #endif // STBI_WRITE_NO_STDIO
 642 | 
 643 | 
 644 | //////////////////////////////////////////////////////////////////////////////
 645 | //
 646 | // PNG writer
 647 | //
 648 | 
 649 | // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
 650 | #define stbiw__sbraw(a) ((int *) (a) - 2)
 651 | #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
 652 | #define stbiw__sbn(a)   stbiw__sbraw(a)[1]
 653 | 
 654 | #define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
 655 | #define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
 656 | #define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
 657 | 
 658 | #define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
 659 | #define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
 660 | #define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
 661 | 
 662 | static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 663 | {
 664 |    int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
 665 |    void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
 666 |    STBIW_ASSERT(p);
 667 |    if (p) {
 668 |       if (!*arr) ((int *) p)[1] = 0;
 669 |       *arr = (void *) ((int *) p + 2);
 670 |       stbiw__sbm(*arr) = m;
 671 |    }
 672 |    return *arr;
 673 | }
 674 | 
 675 | static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
 676 | {
 677 |    while (*bitcount >= 8) {
 678 |       stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
 679 |       *bitbuffer >>= 8;
 680 |       *bitcount -= 8;
 681 |    }
 682 |    return data;
 683 | }
 684 | 
 685 | static int stbiw__zlib_bitrev(int code, int codebits)
 686 | {
 687 |    int res=0;
 688 |    while (codebits--) {
 689 |       res = (res << 1) | (code & 1);
 690 |       code >>= 1;
 691 |    }
 692 |    return res;
 693 | }
 694 | 
 695 | static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
 696 | {
 697 |    int i;
 698 |    for (i=0; i < limit && i < 258; ++i)
 699 |       if (a[i] != b[i]) break;
 700 |    return i;
 701 | }
 702 | 
 703 | static unsigned int stbiw__zhash(unsigned char *data)
 704 | {
 705 |    stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
 706 |    hash ^= hash << 3;
 707 |    hash += hash >> 5;
 708 |    hash ^= hash << 4;
 709 |    hash += hash >> 17;
 710 |    hash ^= hash << 25;
 711 |    hash += hash >> 6;
 712 |    return hash;
 713 | }
 714 | 
// Bit-writer helpers for stbi_zlib_compress. They are not self-contained:
// each one expands to code that reads and writes the local variables
// `out`, `bitbuf` and `bitcount` of the function it is used in.

// write out all complete bytes held in the bit accumulator
#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
// append `codebits` bits of `code` above the bits already pending, then flush
#define stbiw__zlib_add(code,codebits) \
      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
// append a huffman code: same as stbiw__zlib_add but with the bit order of the code reversed
#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
// default huffman tables
// symbols   0..143: 8-bit codes starting at 0x30
#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
// symbols 144..255: 9-bit codes starting at 0x190
#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
// symbols 256..279: 7-bit codes starting at 0
#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
// symbols 280 and up: 8-bit codes starting at 0xc0
#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
// emit any symbol, picking the table from its value
#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
// emit a symbol known to be a byte value (0..255); only the first two tables are needed
#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))

// number of hash buckets in the compressor; used as a mask (stbiw__ZHASH-1), so it must stay a power of two
#define stbiw__ZHASH   16384
 728 | 
 729 | unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
 730 | {
 731 |    static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
 732 |    static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
 733 |    static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
 734 |    static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
 735 |    unsigned int bitbuf=0;
 736 |    int i,j, bitcount=0;
 737 |    unsigned char *out = NULL;
 738 |    unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
 739 |    if (quality < 5) quality = 5;
 740 | 
 741 |    stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
 742 |    stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
 743 |    stbiw__zlib_add(1,1);  // BFINAL = 1
 744 |    stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
 745 | 
 746 |    for (i=0; i < stbiw__ZHASH; ++i)
 747 |       hash_table[i] = NULL;
 748 | 
 749 |    i=0;
 750 |    while (i < data_len-3) {
 751 |       // hash next 3 bytes of data to be compressed
 752 |       int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
 753 |       unsigned char *bestloc = 0;
 754 |       unsigned char **hlist = hash_table[h];
 755 |       int n = stbiw__sbcount(hlist);
 756 |       for (j=0; j < n; ++j) {
 757 |          if (hlist[j]-data > i-32768) { // if entry lies within window
 758 |             int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
 759 |             if (d >= best) best=d,bestloc=hlist[j];
 760 |          }
 761 |       }
 762 |       // when hash table entry is too long, delete half the entries
 763 |       if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
 764 |          STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
 765 |          stbiw__sbn(hash_table[h]) = quality;
 766 |       }
 767 |       stbiw__sbpush(hash_table[h],data+i);
 768 | 
 769 |       if (bestloc) {
 770 |          // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
 771 |          h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
 772 |          hlist = hash_table[h];
 773 |          n = stbiw__sbcount(hlist);
 774 |          for (j=0; j < n; ++j) {
 775 |             if (hlist[j]-data > i-32767) {
 776 |                int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
 777 |                if (e > best) { // if next match is better, bail on current match
 778 |                   bestloc = NULL;
 779 |                   break;
 780 |                }
 781 |             }
 782 |          }
 783 |       }
 784 | 
 785 |       if (bestloc) {
 786 |          int d = (int) (data+i - bestloc); // distance back
 787 |          STBIW_ASSERT(d <= 32767 && best <= 258);
 788 |          for (j=0; best > lengthc[j+1]-1; ++j);
 789 |          stbiw__zlib_huff(j+257);
 790 |          if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
 791 |          for (j=0; d > distc[j+1]-1; ++j);
 792 |          stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
 793 |          if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
 794 |          i += best;
 795 |       } else {
 796 |          stbiw__zlib_huffb(data[i]);
 797 |          ++i;
 798 |       }
 799 |    }
 800 |    // write out final bytes
 801 |    for (;i < data_len; ++i)
 802 |       stbiw__zlib_huffb(data[i]);
 803 |    stbiw__zlib_huff(256); // end of block
 804 |    // pad with 0 bits to byte boundary
 805 |    while (bitcount)
 806 |       stbiw__zlib_add(0,1);
 807 | 
 808 |    for (i=0; i < stbiw__ZHASH; ++i)
 809 |       (void) stbiw__sbfree(hash_table[i]);
 810 |    STBIW_FREE(hash_table);
 811 | 
 812 |    {
 813 |       // compute adler32 on input
 814 |       unsigned int s1=1, s2=0;
 815 |       int blocklen = (int) (data_len % 5552);
 816 |       j=0;
 817 |       while (j < data_len) {
 818 |          for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
 819 |          s1 %= 65521, s2 %= 65521;
 820 |          j += blocklen;
 821 |          blocklen = 5552;
 822 |       }
 823 |       stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
 824 |       stbiw__sbpush(out, STBIW_UCHAR(s2));
 825 |       stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
 826 |       stbiw__sbpush(out, STBIW_UCHAR(s1));
 827 |    }
 828 |    *out_len = stbiw__sbn(out);
 829 |    // make returned pointer freeable
 830 |    STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
 831 |    return (unsigned char *) stbiw__sbraw(out);
 832 | }
 833 | 
 834 | static unsigned int stbiw__crc32(unsigned char *buffer, int len)
 835 | {
 836 |    static unsigned int crc_table[256] =
 837 |    {
 838 |       0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
 839 |       0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
 840 |       0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
 841 |       0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
 842 |       0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
 843 |       0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
 844 |       0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
 845 |       0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
 846 |       0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
 847 |       0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
 848 |       0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
 849 |       0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
 850 |       0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
 851 |       0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
 852 |       0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
 853 |       0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
 854 |       0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
 855 |       0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
 856 |       0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
 857 |       0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
 858 |       0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
 859 |       0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
 860 |       0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
 861 |       0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
 862 |       0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
 863 |       0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
 864 |       0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
 865 |       0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
 866 |       0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
 867 |       0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
 868 |       0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
 869 |       0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
 870 |    };
 871 | 
 872 |    unsigned int crc = ~0u;
 873 |    int i;
 874 |    for (i=0; i < len; ++i)
 875 |       crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
 876 |    return ~crc;
 877 | }
 878 | 
 879 | #define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
 880 | #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
 881 | #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
 882 | 
 883 | static void stbiw__wpcrc(unsigned char **data, int len)
 884 | {
 885 |    unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
 886 |    stbiw__wp32(*data, crc);
 887 | }
 888 | 
 889 | static unsigned char stbiw__paeth(int a, int b, int c)
 890 | {
 891 |    int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
 892 |    if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
 893 |    if (pb <= pc) return STBIW_UCHAR(b);
 894 |    return STBIW_UCHAR(c);
 895 | }
 896 | 
 897 | // @OPTIMIZE: provide an option that always forces left-predict or paeth predict
 898 | unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
 899 | {
 900 |    int ctype[5] = { -1, 0, 4, 2, 6 };
 901 |    unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
 902 |    unsigned char *out,*o, *filt, *zlib;
 903 |    signed char *line_buffer;
 904 |    int i,j,k,p,zlen;
 905 | 
 906 |    if (stride_bytes == 0)
 907 |       stride_bytes = x * n;
 908 | 
 909 |    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
 910 |    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
 911 |    for (j=0; j < y; ++j) {
 912 |       static int mapping[] = { 0,1,2,3,4 };
 913 |       static int firstmap[] = { 0,1,0,5,6 };
 914 |       int *mymap = (j != 0) ? mapping : firstmap;
 915 |       int best = 0, bestval = 0x7fffffff;
 916 |       for (p=0; p < 2; ++p) {
 917 |          for (k= p?best:0; k < 5; ++k) { // @TODO: clarity: rewrite this to go 0..5, and 'continue' the unwanted ones during 2nd pass
 918 |             int type = mymap[k],est=0;
 919 |             unsigned char *z = pixels + stride_bytes*j;
 920 |             for (i=0; i < n; ++i)
 921 |                switch (type) {
 922 |                   case 0: line_buffer[i] = z[i]; break;
 923 |                   case 1: line_buffer[i] = z[i]; break;
 924 |                   case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
 925 |                   case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
 926 |                   case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
 927 |                   case 5: line_buffer[i] = z[i]; break;
 928 |                   case 6: line_buffer[i] = z[i]; break;
 929 |                }
 930 |             for (i=n; i < x*n; ++i) {
 931 |                switch (type) {
 932 |                   case 0: line_buffer[i] = z[i]; break;
 933 |                   case 1: line_buffer[i] = z[i] - z[i-n]; break;
 934 |                   case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
 935 |                   case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
 936 |                   case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
 937 |                   case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
 938 |                   case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
 939 |                }
 940 |             }
 941 |             if (p) break;
 942 |             for (i=0; i < x*n; ++i)
 943 |                est += abs((signed char) line_buffer[i]);
 944 |             if (est < bestval) { bestval = est; best = k; }
 945 |          }
 946 |       }
 947 |       // when we get here, best contains the filter type, and line_buffer contains the data
 948 |       filt[j*(x*n+1)] = (unsigned char) best;
 949 |       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
 950 |    }
 951 |    STBIW_FREE(line_buffer);
 952 |    zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
 953 |    STBIW_FREE(filt);
 954 |    if (!zlib) return 0;
 955 | 
 956 |    // each tag requires 12 bytes of overhead
 957 |    out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
 958 |    if (!out) return 0;
 959 |    *out_len = 8 + 12+13 + 12+zlen + 12;
 960 | 
 961 |    o=out;
 962 |    STBIW_MEMMOVE(o,sig,8); o+= 8;
 963 |    stbiw__wp32(o, 13); // header length
 964 |    stbiw__wptag(o, "IHDR");
 965 |    stbiw__wp32(o, x);
 966 |    stbiw__wp32(o, y);
 967 |    *o++ = 8;
 968 |    *o++ = STBIW_UCHAR(ctype[n]);
 969 |    *o++ = 0;
 970 |    *o++ = 0;
 971 |    *o++ = 0;
 972 |    stbiw__wpcrc(&o,13);
 973 | 
 974 |    stbiw__wp32(o, zlen);
 975 |    stbiw__wptag(o, "IDAT");
 976 |    STBIW_MEMMOVE(o, zlib, zlen);
 977 |    o += zlen;
 978 |    STBIW_FREE(zlib);
 979 |    stbiw__wpcrc(&o, zlen);
 980 | 
 981 |    stbiw__wp32(o,0);
 982 |    stbiw__wptag(o, "IEND");
 983 |    stbiw__wpcrc(&o,0);
 984 | 
 985 |    STBIW_ASSERT(o == out + *out_len);
 986 | 
 987 |    return out;
 988 | }
 989 | 
 990 | #ifndef STBI_WRITE_NO_STDIO
 991 | STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
 992 | {
 993 |    FILE *f;
 994 |    int len;
 995 |    unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
 996 |    if (png == NULL) return 0;
 997 |    f = fopen(filename, "wb");
 998 |    if (!f) { STBIW_FREE(png); return 0; }
 999 |    fwrite(png, 1, len, f);
1000 |    fclose(f);
1001 |    STBIW_FREE(png);
1002 |    return 1;
1003 | }
1004 | #endif
1005 | 
1006 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
1007 | {
1008 |    int len;
1009 |    unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
1010 |    if (png == NULL) return 0;
1011 |    func(context, png, len);
1012 |    STBIW_FREE(png);
1013 |    return 1;
1014 | }
1015 | 
1016 | #endif // STB_IMAGE_WRITE_IMPLEMENTATION
1017 | 
1018 | /* Revision history
1019 |       1.04 (2017-03-03)
1020 |              monochrome BMP expansion
1021 |       1.03   ???
1022 |       1.02 (2016-04-02)
1023 |              avoid allocating large structures on the stack
1024 |       1.01 (2016-01-16)
1025 |              STBIW_REALLOC_SIZED: support allocators with no realloc support
1026 |              avoid race-condition in crc initialization
1027 |              minor compile issues
1028 |       1.00 (2015-09-14)
1029 |              installable file IO function
1030 |       0.99 (2015-09-13)
1031 |              warning fixes; TGA rle support
1032 |       0.98 (2015-04-08)
1033 |              added STBIW_MALLOC, STBIW_ASSERT etc
1034 |       0.97 (2015-01-18)
1035 |              fixed HDR asserts, rewrote HDR rle logic
1036 |       0.96 (2015-01-17)
1037 |              add HDR output
1038 |              fix monochrome BMP
1039 |       0.95 (2014-08-17)
1040 | 		       add monochrome TGA output
1041 |       0.94 (2014-05-31)
1042 |              rename private functions to avoid conflicts with stb_image.h
1043 |       0.93 (2014-05-27)
1044 |              warning fixes
1045 |       0.92 (2010-08-01)
1046 |              casts to unsigned char to fix warnings
1047 |       0.91 (2010-07-17)
1048 |              first public release
1049 |       0.90   first internal release
1050 | */
1051 | 
1052 | /*
1053 | ------------------------------------------------------------------------------
1054 | This software is available under 2 licenses -- choose whichever you prefer.
1055 | ------------------------------------------------------------------------------
1056 | ALTERNATIVE A - MIT License
1057 | Copyright (c) 2017 Sean Barrett
1058 | Permission is hereby granted, free of charge, to any person obtaining a copy of 
1059 | this software and associated documentation files (the "Software"), to deal in 
1060 | the Software without restriction, including without limitation the rights to 
1061 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
1062 | of the Software, and to permit persons to whom the Software is furnished to do 
1063 | so, subject to the following conditions:
1064 | The above copyright notice and this permission notice shall be included in all 
1065 | copies or substantial portions of the Software.
1066 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
1067 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
1068 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
1069 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
1070 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
1071 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
1072 | SOFTWARE.
1073 | ------------------------------------------------------------------------------
1074 | ALTERNATIVE B - Public Domain (www.unlicense.org)
1075 | This is free and unencumbered software released into the public domain.
1076 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
1077 | software, either in source code form or as a compiled binary, for any purpose, 
1078 | commercial or non-commercial, and by any means.
1079 | In jurisdictions that recognize copyright laws, the author or authors of this 
1080 | software dedicate any and all copyright interest in the software to the public 
1081 | domain. We make this dedication for the benefit of the public at large and to 
1082 | the detriment of our heirs and successors. We intend this dedication to be an 
1083 | overt act of relinquishment in perpetuity of all present and future rights to 
1084 | this software under copyright law.
1085 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
1086 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
1087 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
1088 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
1089 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
1090 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1091 | ------------------------------------------------------------------------------
1092 | */
1093 | 


--------------------------------------------------------------------------------
/RBFilter_AVX2.cpp:
--------------------------------------------------------------------------------
   1 | #include "stdafx.h"
   2 | #include "RBFilter_AVX2.h"
   3 | #include <math.h>
   4 | #include <malloc.h>
   5 | #include <new>  
   6 | #include <emmintrin.h>
   7 | #include <tmmintrin.h>
   8 | #include <immintrin.h>
   9 | #include <smmintrin.h>
  10 | #include <thread>
  11 | 
  12 | 
  13 | #define MAX_RANGE_TABLE_SIZE 255
  14 | #define ALIGN_SIZE 32
  15 | 
  16 | // only 1 of following 2 should be defined
  17 | #define EDGE_COLOR_USE_MAXIMUM
  18 | //#define EDGE_COLOR_USE_ADDITION
  19 | 
  20 | // if EDGE_COLOR_USE_MAXIMUM is defined, then edge color detection works by calculating
  21 | // maximum difference among 3 components (RGB) of 2 colors, which tends to result in lower differences (since only largest among 3 is selected)
  22 | // if EDGE_COLOR_USE_ADDITION is defined, then edge color detection works by calculating
  23 | // sum of all 3 components, while enforcing 255 maximum. This method is much more sensitive to small differences 
  24 | 
  25 | #if defined(EDGE_COLOR_USE_MAXIMUM) && defined(EDGE_COLOR_USE_ADDITION)
  26 | #error Only 1 of those can be defined
  27 | #endif
  28 | 
  29 | #if !defined(EDGE_COLOR_USE_MAXIMUM) && !defined(EDGE_COLOR_USE_ADDITION)
  30 | #error 1 of those must be defined
  31 | #endif
  32 | 
  33 | CRBFilterAVX2::CRBFilterAVX2()
  34 | {
  35 | 	m_range_table = new float[MAX_RANGE_TABLE_SIZE + 1];
  36 | 	memset(m_range_table, 0, (MAX_RANGE_TABLE_SIZE + 1) * sizeof(float));
  37 | }
  38 | 
  39 | CRBFilterAVX2::~CRBFilterAVX2()
  40 | {
  41 | 	release();
  42 | 
  43 | 	delete[] m_range_table;
  44 | }
  45 | 
  46 | bool CRBFilterAVX2::initialize(int width, int height, int thread_count, bool pipelined)
  47 | {
  48 | 	// basic sanity check, not strict
  49 | 	if (width < 16 || width > 10000)
  50 | 		return false;
  51 | 
  52 | 	if (height < 2 || height > 10000)
  53 | 		return false;
  54 | 
  55 | 	if (thread_count < 1 || thread_count > RBF_MAX_THREADS)
  56 | 		return false;
  57 | 	
  58 | 	release();
  59 | 
  60 | 	m_thread_count = thread_count;
  61 | 
  62 | 	// round height to nearest even number
  63 | 	if (height & 1)
  64 | 		height++;
  65 | 
  66 | 	m_reserved_width = getOptimalPitch(width) / 4;
  67 | 	m_reserved_height = height;
  68 | 	
  69 | 
  70 | 	m_stage_buffer[0] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE);
  71 | 	if (!m_stage_buffer[0])
  72 | 		return false;
  73 | 
  74 | 	if (pipelined)
  75 | 	{
  76 | 		for (int i = 1; i < STAGE_BUFFER_COUNT; i++)
  77 | 		{
  78 | 			m_stage_buffer[i] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE);
  79 | 			if (!m_stage_buffer[i])
  80 | 				return false;
  81 | 		}
  82 | 	}
  83 | 
  84 | 	/////////////////
  85 | 	m_h_line_cache = new (std::nothrow) float*[m_thread_count];
  86 | 	if (!m_h_line_cache)
  87 | 		return false;
  88 | 
  89 | 	// zero just in case
  90 | 	for (int i = 0; i < m_thread_count; i++)
  91 | 		m_h_line_cache[i] = nullptr;
  92 | 
  93 | 	for (int i = 0; i < m_thread_count; i++)
  94 | 	{
  95 | 		m_h_line_cache[i] = (float*)_aligned_malloc(m_reserved_width * 12 * sizeof(float) * 2 + 128, ALIGN_SIZE);
  96 | 		if (!m_h_line_cache[i])
  97 | 			return false;
  98 | 
  99 | 		// 1st 8 bytes of line cache should remain constant zero
 100 | 		memset(m_h_line_cache[i], 0, 8 * sizeof(float));
 101 | 	}
 102 | 
 103 | 	////////////////
 104 | 	m_v_line_cache = new (std::nothrow) float*[m_thread_count];
 105 | 	if (!m_v_line_cache)
 106 | 		return false;
 107 | 
 108 | 	for (int i = 0; i < m_thread_count; i++)
 109 | 		m_v_line_cache[i] = nullptr;
 110 | 
 111 | 	int v_line_size = (m_reserved_width * 16 * sizeof(float)) / m_thread_count;
 112 | 	for (int i = 0; i < m_thread_count; i++)
 113 | 	{
 114 | 		m_v_line_cache[i] = (float*)_aligned_malloc(v_line_size, ALIGN_SIZE);
 115 | 		if (!m_v_line_cache[i])
 116 | 			return false;
 117 | 	}
 118 | 
 119 | 	return true;
 120 | }
 121 | 
 122 | void CRBFilterAVX2::release()
 123 | {
 124 | 	for (int i = 0; i < STAGE_BUFFER_COUNT; i++)
 125 | 	{
 126 | 		if (m_stage_buffer[i])
 127 | 		{
 128 | 			_aligned_free(m_stage_buffer[i]);
 129 | 			m_stage_buffer[i] = nullptr;
 130 | 		}
 131 | 	}
 132 | 
 133 | 	if (m_h_line_cache)
 134 | 	{
 135 | 		for (int i = 0; i < m_thread_count; i++)
 136 | 		{
 137 | 			if (m_h_line_cache[i])
 138 | 				_aligned_free(m_h_line_cache[i]);
 139 | 		}
 140 | 		delete[] m_h_line_cache;
 141 | 		m_h_line_cache = nullptr;
 142 | 	}
 143 | 
 144 | 	if (m_v_line_cache)
 145 | 	{
 146 | 		for (int i = 0; i < m_thread_count; i++)
 147 | 		{
 148 | 			if (m_v_line_cache[i])
 149 | 				_aligned_free(m_v_line_cache[i]);
 150 | 		}
 151 | 		delete[] m_v_line_cache;
 152 | 		m_v_line_cache = nullptr;
 153 | 	}
 154 | 
 155 | 	m_reserved_width = 0;
 156 | 	m_reserved_height = 0;
 157 | 	m_thread_count = 0;
 158 | 	m_pipelined = false;
 159 | 	m_filter_counter = 0;
 160 | }
 161 | 
 162 | int CRBFilterAVX2::getOptimalPitch(int width) const
 163 | {
 164 | 	width *= 4;
 165 | 
 166 | 	int round_up = ALIGN_SIZE * m_thread_count;
 167 | 	if (width % round_up)
 168 | 	{
 169 | 		width += round_up - width % round_up;
 170 | 	}
 171 | 
 172 | 	return width;
 173 | }
 174 | 
 175 | void CRBFilterAVX2::setSigma(float sigma_spatial, float sigma_range)
 176 | {
 177 | 	if (m_sigma_spatial != sigma_spatial || m_sigma_range != sigma_range)
 178 | 	{
 179 | 		m_sigma_spatial = sigma_spatial;
 180 | 		m_sigma_range = sigma_range;
 181 | 
 182 | 		double alpha_f = (exp(-sqrt(2.0) / (sigma_spatial * 255.0)));
 183 | 		m_inv_alpha_f = (float)(1.0 - alpha_f);
 184 | 		double inv_sigma_range = 1.0 / (sigma_range * MAX_RANGE_TABLE_SIZE);
 185 | 		{
 186 | 			double ii = 0.f;
 187 | 			for (int i = 0; i <= MAX_RANGE_TABLE_SIZE; i++, ii -= 1.0)
 188 | 			{
 189 | 				m_range_table[i] = (float)(alpha_f * exp(ii * inv_sigma_range));
 190 | 			}
 191 | 		}
 192 | 	}
 193 | }
 194 | 
// Example of the edge color difference calculation from the original implementation.
// The idea is to fit the maximum edge color difference into a single number in the 0-255 range:
// two component differences are added and scaled down 4x, while the third component is scaled down 2x.
// This means that one of the components is more dominant.
 199 | 
 200 | //int getDiffFactor(const unsigned char* color1, const unsigned char* color2)
 201 | //{
 202 | //	int c1 = abs(color1[0] - color2[0]);
 203 | //	int c2 = abs(color1[1] - color2[1]);
 204 | //	int c3 = abs(color1[2] - color2[2]);
 205 | //
 206 | //	return ((c1 + c3) >> 2) + (c2 >> 1);
 207 | //}
 208 | 
 209 | 
 210 | inline void getDiffFactor3x(__m256i pix8, __m256i pix8p, __m256i* diff8x)
 211 | {
 212 | 	__m256i byte_mask = _mm256_set1_epi32(255);
 213 | 
 214 | 	// get absolute difference for each component per pixel
 215 | 	__m256i diff = _mm256_sub_epi8(_mm256_max_epu8(pix8, pix8p), _mm256_min_epu8(pix8, pix8p));
 216 | 
 217 | #ifdef EDGE_COLOR_USE_MAXIMUM
 218 | 	// get maximum of 3 components
 219 | 	__m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component
 220 | 	diff = _mm256_max_epu8(diff, diff_shift1);
 221 | 	diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component
 222 | 	diff = _mm256_max_epu8(diff, diff_shift1);
 223 | 	// skip alpha component
 224 | 	diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte
 225 | #endif
 226 | 
 227 | #ifdef EDGE_COLOR_USE_ADDITION
 228 | 	// add all component differences and saturate 
 229 | 	__m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component
 230 | 	diff = _mm256_adds_epu8(diff, diff_shift1);
 231 | 	diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component
 232 | 	diff = _mm256_adds_epu8(diff, diff_shift1);
 233 | 	diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte
 234 | #endif
 235 | 
 236 | 	_mm256_store_si256(diff8x, diff);
 237 | }
 238 | 
 239 | void CRBFilterAVX2::horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch)
 240 | {
 241 | 	// force height segments to be even cause this filter processes 2 lines at a time
 242 | 	int height_segment = (height / m_thread_count) & (~1); 
 243 | 	int buffer_offset = thread_index * height_segment * pitch;
 244 | 	img_src += buffer_offset;
 245 | 	img_dst += buffer_offset;
 246 | 
 247 | 	int width32 = pitch / 32;
 248 | 
 249 | 	// last segment should account for uneven height
 250 | 	// since reserve buffer height is rounded up to even number, it's OK if source is uneven
 251 | 	// but that assumes hozitonal filter output buffer is the reservered buffer, or that destination is rounded up to even number
 252 | 	if (thread_index + 1 == m_thread_count) 
 253 | 		height_segment = height - thread_index * height_segment;
 254 | 
 255 | //	float* alpha_cache_start = m_alpha_cache[thread_index];
 256 | 	// cache line structure: 
 257 | 	// 4 floats of alpha_f from line 1
 258 | 	// 4 floats of alpha_f from line 2
 259 | 	// 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 1
 260 | 	// 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 2
 261 | 	// 4 floats of 1st pass result color from line 1
 262 | 	// 4 floats of 1st pass result color from line 2
 263 | 	float* line_cache = m_h_line_cache[thread_index];
 264 | 	const float* range_table = m_range_table;
 265 | 
 266 | 	__declspec(align(32)) long color_diff[16];
 267 | 
 268 | 	_mm256_zeroall();
 269 | 
 270 | 	__m256i mask_unpack = _mm256_setr_epi8(12, -1, -1, -1,		// pixel 1 R
 271 | 		13, -1, -1, -1,		// pixel 1 G
 272 | 		14, -1, -1, -1,		// pixel 1 B
 273 | 		15, -1, -1, -1,		// pixel 1 A
 274 | 		12, -1, -1, -1, // pixel 2 R
 275 | 		13, -1, -1, -1, // pixel 2 G
 276 | 		14, -1, -1, -1, // pixel 2 B
 277 | 		15, -1, -1, -1);// pixel 2 A
 278 | 
 279 | 	__m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1
 280 | 		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2
 281 | 
 282 | 	__m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f);
 283 | 
 284 | 	// process 2 horizontal lines at a time
 285 | 	for (int y = 0; y < height_segment; y+= 2)
 286 | 	{
 287 | 		__m256 alpha_prev = _mm256_set1_ps(1.f);
 288 | 		__m256 color_prev;
 289 | 
 290 | 		
 291 | 		float* line_buffer = line_cache + 24 * pitch / 4;
 292 | 		// 1st line
 293 | 		int buffer_inc = (y + 1) * pitch - 32;
 294 | 		const __m256i* src1_8xCur = (const __m256i*)(img_src + buffer_inc);
 295 | 		const __m256i* src1_8xPrev = (const __m256i*)(img_src + buffer_inc + 4);
 296 | 		// 2nd line
 297 | 		buffer_inc += pitch;
 298 | 		const __m256i* src2_8xCur = (const __m256i*)(img_src + buffer_inc);
 299 | 		const __m256i* src2_8xPrev = (const __m256i*)(img_src + buffer_inc + 4);
 300 | 		
 301 | 
 302 | 		/////////////////////////////
 303 | 		// right to left pass
 304 | 		for (int x = 0; x < width32; x++)
 305 | 		{
 306 | 			__m256i pix8_1 = _mm256_load_si256(src1_8xCur--);
 307 | 			__m256i pix8p_1 = _mm256_loadu_si256(src1_8xPrev--);
 308 | 			getDiffFactor3x(pix8_1, pix8p_1, (__m256i*)color_diff);
 309 | 
 310 | 			__m256i pix8_2 = _mm256_load_si256(src2_8xCur--);
 311 | 			__m256i pix8p_2 = _mm256_loadu_si256(src2_8xPrev--);
 312 | 			getDiffFactor3x(pix8_2, pix8p_2, (__m256i*)(color_diff + 8));
 313 | 
 314 | 			// last 4 pixels of 2 lines
 315 | 			__m256i pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 1 | (3 << 4));
 316 | 
 317 | 			////////////////////
 318 | 			// pixel 1 unpack
 319 | 			{
 320 | 				// alpha factor
 321 | 				float alpha2_f = range_table[color_diff[7]];
 322 | 				float alpha1_f = range_table[color_diff[7 + 8]];
 323 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 324 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 325 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 326 | 
 327 | 				// source pixel
 328 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 329 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 330 | 				if (x == 0) // have to initialize prev_color with last pixel color, this condition has no noticeable penalty
 331 | 					color_prev = pix2f;
 332 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 333 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 334 | 
 335 | 				// filter 
 336 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 337 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 338 | 
 339 | 				// final color
 340 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 341 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 342 | 				line_buffer -= 24;
 343 | 			}
 344 | 
 345 | 			////////////////////
 346 | 			// pixel 2 unpack
 347 | 			{
 348 | 				// alpha factor
 349 | 				float alpha2_f = range_table[color_diff[6]];
 350 | 				float alpha1_f = range_table[color_diff[6 + 8]];
 351 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 352 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 353 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 354 | 
 355 | 				// source pixel
 356 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 357 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 358 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 359 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 360 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 361 | 
 362 | 				// filter 
 363 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 364 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 365 | 
 366 | 				// final color
 367 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 368 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 369 | 				line_buffer -= 24;
 370 | 			}
 371 | 
 372 | 
 373 | 			////////////////////
 374 | 			// pixel 3 unpack
 375 | 			{
 376 | 				// alpha factor
 377 | 				float alpha2_f = range_table[color_diff[5]];
 378 | 				float alpha1_f = range_table[color_diff[5 + 8]];
 379 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 380 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 381 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 382 | 
 383 | 				// source pixel
 384 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 385 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 386 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 387 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 388 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 389 | 
 390 | 				// filter 
 391 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 392 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 393 | 
 394 | 				// final color
 395 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 396 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 397 | 				line_buffer -= 24;
 398 | 			}
 399 | 
 400 | 			////////////////////
 401 | 			// pixel 4 unpack
 402 | 			{
 403 | 				// alpha factor
 404 | 				float alpha2_f = range_table[color_diff[4]];
 405 | 				float alpha1_f = range_table[color_diff[4 + 8]];
 406 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 407 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 408 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 409 | 
 410 | 				// source pixel
 411 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 412 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 413 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 414 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 415 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 416 | 
 417 | 				// filter 
 418 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 419 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 420 | 
 421 | 				// final color
 422 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 423 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 424 | 				line_buffer -= 24;
 425 | 			}
 426 | 
 427 | 			// next 4 pixels of 2 lines
 428 | 			pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 2 << 4);
 429 | 
 430 | 
 431 | 			////////////////////
 432 | 			// pixel 5 unpack
 433 | 			{
 434 | 				// alpha factor
 435 | 				float alpha2_f = range_table[color_diff[3]];
 436 | 				float alpha1_f = range_table[color_diff[3 + 8]];
 437 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 438 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 439 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 440 | 
 441 | 				// source pixel
 442 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 443 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 444 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 445 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 446 | 
 447 | 				// filter 
 448 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 449 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 450 | 
 451 | 																			 // final color
 452 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 453 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 454 | 				line_buffer -= 24;
 455 | 			}
 456 | 
 457 | 
 458 | 			////////////////////
 459 | 			// pixel 6 unpack
 460 | 			{
 461 | 				// alpha factor
 462 | 				float alpha2_f = range_table[color_diff[2]];
 463 | 				float alpha1_f = range_table[color_diff[2 + 8]];
 464 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 465 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 466 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 467 | 
 468 | 				// source pixel
 469 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 470 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 471 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 472 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 473 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 474 | 
 475 | 				// filter 
 476 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 477 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 478 | 
 479 | 				// final color
 480 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 481 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 482 | 				line_buffer -= 24;
 483 | 			}
 484 | 
 485 | 
 486 | 			////////////////////
 487 | 			// pixel 7 unpack
 488 | 			{
 489 | 				// alpha factor
 490 | 				float alpha2_f = range_table[color_diff[1]];
 491 | 				float alpha1_f = range_table[color_diff[1 + 8]];
 492 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 493 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 494 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 495 | 
 496 | 				// source pixel
 497 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 498 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 499 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 500 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 501 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 502 | 
 503 | 				// filter 
 504 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 505 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 506 | 
 507 | 				// final color
 508 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 509 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 510 | 				line_buffer -= 24;
 511 | 			}
 512 | 
 513 | 
 514 | 			////////////////////
 515 | 			// pixel 8 unpack
 516 | 			{
 517 | 				// alpha factor
 518 | 				float alpha2_f = range_table[color_diff[0]];
 519 | 				float alpha1_f = range_table[color_diff[0 + 8]];
 520 | 				__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 521 | 					alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 522 | 				_mm256_store_ps(line_buffer, alpha_f_8x); // cache weights
 523 | 
 524 | 				// source pixel
 525 | 				pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel
 526 | 				__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD
 527 | 				__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 528 | 				pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color
 529 | 				_mm256_store_ps(line_buffer + 8, pix2f);
 530 | 
 531 | 				// filter 
 532 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 533 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 534 | 
 535 | 				// final color
 536 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 537 | 				_mm256_store_ps(line_buffer + 16, out_color); // cache final color
 538 | 				line_buffer -= 24;
 539 | 			}
 540 | 
 541 | 			
 542 | 		}
 543 | 
 544 | 		/////////////////////////////
 545 | 		// left to right pass
 546 | 		__m256i* dst1_pix8 = (__m256i*)(img_dst + y * pitch);
 547 | 		__m256i* dst2_pix8 = (__m256i*)(img_dst + (y + 1) * pitch);
 548 | 
 549 | 		for (int x = 0; x < width32; x++)
 550 | 		{
 551 | 			__m256i result1;
 552 | 			__m256i result2;
 553 | 
 554 | 			/////////////
 555 | 			// 1st 4 pixels
 556 | 			// pixel 1
 557 | 			{
 558 | 				// alpha
 559 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 560 | 				line_buffer += 24;
 561 | 
 562 | 				// get pre-multiplied source color
 563 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 564 | 
 565 | 				// first pixel in line needs to initialize color_prev to original source color
 566 | 				if (x == 0)
 567 | 					color_prev = _mm256_div_ps(pix2f, inv_alpha); // source color was premultiplied
 568 | 
 569 | 				// filter 
 570 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 571 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 572 | 
 573 | 				// final color 
 574 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 575 | 
 576 | 				// get final color from previous pass
 577 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 578 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 579 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 580 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 581 | 
 582 | 				// pack result
 583 | 				result1 = _mm256_shuffle_epi8(pix2i, mask_pack);
 584 | 			}
 585 | 			
 586 | 
 587 | 			// pixel 2
 588 | 			{
 589 | 				// alpha
 590 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 591 | 				line_buffer += 24;
 592 | 
 593 | 				// get pre-multiplied source color
 594 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 595 | 
 596 | 				// filter 
 597 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 598 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 599 | 
 600 | 				// final color
 601 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 602 | 
 603 | 				// get final color from previous pass
 604 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 605 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 606 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 607 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 608 | 
 609 | 				// pack result
 610 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 611 | 				result1 = _mm256_srli_si256(result1, 4); // shift 
 612 | 				result1 = _mm256_or_si256(result1, pix2i); // combine
 613 | 			}
 614 | 
 615 | 			// pixel 3
 616 | 			{
 617 | 				// alpha
 618 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 619 | 				line_buffer += 24;
 620 | 
 621 | 				// get pre-multiplied source color
 622 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 623 | 
 624 | 				// filter 
 625 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 626 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 627 | 
 628 | 				// final color
 629 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 630 | 
 631 | 				// get final color from previous pass
 632 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 633 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 634 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 635 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 636 | 
 637 | 				// pack result
 638 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 639 | 				result1 = _mm256_srli_si256(result1, 4); // shift 
 640 | 				result1 = _mm256_or_si256(result1, pix2i); // combine
 641 | 			}
 642 | 
 643 | 			// pixel 4
 644 | 			{
 645 | 				// alpha
 646 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 647 | 				line_buffer += 24;
 648 | 
 649 | 				// get pre-multiplied source color
 650 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 651 | 
 652 | 				// filter 
 653 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 654 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 655 | 
 656 | 				// final color
 657 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 658 | 
 659 | 				// get final color from previous pass
 660 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 661 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 662 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 663 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 664 | 
 665 | 				// pack result
 666 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 667 | 				result1 = _mm256_srli_si256(result1, 4); // shift 
 668 | 				result1 = _mm256_or_si256(result1, pix2i); // combine
 669 | 			}
 670 | 		
 671 | 			// next 4 pixels packed in result2
 672 | 			// pixel 5	
 673 | 			{
 674 | 				// alpha
 675 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 676 | 				line_buffer += 24;
 677 | 
 678 | 				// get pre-multiplied source color
 679 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 680 | 
 681 | 				// filter 
 682 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 683 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 684 | 
 685 | 				// final color
 686 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 687 | 
 688 | 				// get final color from previous pass
 689 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 690 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 691 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 692 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 693 | 
 694 | 				// pack result
 695 | 				result2 = _mm256_shuffle_epi8(pix2i, mask_pack);
 696 | 			}
 697 | 			
 698 | 			// pixel 6
 699 | 			{
 700 | 				// alpha
 701 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 702 | 				line_buffer += 24;
 703 | 
 704 | 				// get pre-multiplied source color
 705 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 706 | 
 707 | 				// filter 
 708 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 709 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 710 | 
 711 | 				// final color
 712 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 713 | 
 714 | 				// get final color from previous pass
 715 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 716 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 717 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 718 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 719 | 
 720 | 				// pack result
 721 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 722 | 				result2 = _mm256_srli_si256(result2, 4); // shift 
 723 | 				result2 = _mm256_or_si256(result2, pix2i); // combine
 724 | 			}
 725 | 
 726 | 			// pixel 7
 727 | 			{
 728 | 				// alpha
 729 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 730 | 				line_buffer += 24;
 731 | 
 732 | 				// get pre-multiplied source color
 733 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 734 | 
 735 | 				// filter 
 736 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 737 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 738 | 
 739 | 				// final color
 740 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 741 | 
 742 | 				// get final color from previous pass
 743 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 744 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 745 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 746 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 747 | 
 748 | 				// pack result
 749 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 750 | 				result2 = _mm256_srli_si256(result2, 4); // shift 
 751 | 				result2 = _mm256_or_si256(result2, pix2i); // combine
 752 | 			}
 753 | 
 754 | 			// pixel 8
 755 | 			{
 756 | 				// alpha
 757 | 				__m256 alpha_f_8x = _mm256_load_ps(line_buffer);
 758 | 				line_buffer += 24;
 759 | 
 760 | 				// get pre-multiplied source color
 761 | 				__m256 pix2f = _mm256_load_ps(line_buffer + 8);
 762 | 
 763 | 				// filter 
 764 | 				alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 765 | 				color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 766 | 
 767 | 				// final color
 768 | 				__m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color
 769 | 
 770 | 				// get final color from previous pass
 771 | 				__m256 pix2f_p = _mm256_load_ps(line_buffer + 16);
 772 | 				out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color
 773 | 				__m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer
 774 | 				pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2
 775 | 
 776 | 				// pack result
 777 | 				pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 778 | 				result2 = _mm256_srli_si256(result2, 4); // shift 
 779 | 				result2 = _mm256_or_si256(result2, pix2i); // combine
 780 | 			}
 781 | 
 782 | 			// separate packed results into lines
 783 | 			__m256i line1 = _mm256_permute2f128_si256(result1, result2, 2 << 4);
 784 | 			__m256i line2 = _mm256_permute2f128_si256(result1, result2, 1 | (3 << 4));
 785 | 
 786 | 			// store result
 787 | 			_mm256_store_si256(dst1_pix8++, line1);
 788 | 			_mm256_store_si256(dst2_pix8++, line2);
 789 | 		}
 790 | 	}
 791 | 
 792 | }
 793 | 
 794 | 
 795 | void CRBFilterAVX2::verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch)
 796 | {
 797 | 	int width_segment = width / m_thread_count;
 798 | 	// make sure width segments round to 32 byte boundary
 799 | 	width_segment -= width_segment % 8;
 800 | 	int start_offset = width_segment * thread_index;
 801 | 	if (thread_index == m_thread_count - 1) // last one
 802 | 	{
 803 | 		width_segment = getOptimalPitch(width) / 4 - start_offset;
 804 | 	}
 805 | 
 806 | 	int width8 = width_segment / 8;
 807 | 
 808 | 	// adjust img buffer starting positions
 809 | 	img_src += start_offset * 4;
 810 | 	img_dst += start_offset * 4;
 811 | 
 812 | 	float* line_cache = m_v_line_cache[thread_index];
 813 | 	const float* range_table = m_range_table;
 814 | 
 815 | 	_mm256_zeroall();
 816 | 
 817 | 	__m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f);
 818 | 
 819 | 	__m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1
 820 | 		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2
 821 | 
 822 | 	__m256i mask_unpack = _mm256_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, // pixel 1
 823 | 										0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); // pixel 2
 824 | 
 825 | 	// used to store maximum difference between 2 pixels
 826 | 	__declspec(align(32)) long color_diff[8];
 827 | 
 828 | 	/////////////////
 829 | 	// Bottom to top pass first
 830 | 	{
 831 | 		// last line processed separately since no previous
 832 | 		{
 833 | 			float* line_buffer = line_cache;
 834 | 			__m256i* dst_buf = (__m256i*)(img_dst + (height - 1) * pitch);
 835 | 			__m256i* src_8xCur = (__m256i*)(img_src + (height - 1) * pitch);
 836 | 
 837 | 			__m256 one = _mm256_set1_ps(1.f);
 838 | 
 839 | 			for (int x = 0; x < width8; x++)
 840 | 			{
 841 | 				__m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel
 842 | 				_mm256_store_si256(dst_buf++, pix8); // copy to destination
 843 | 
 844 | 				for (int i = 0; i < 4; i++)
 845 | 				{
 846 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
 847 | 					pix8 = _mm256_srli_si256(pix8, 4); // shift right
 848 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 849 | 
 850 | 					_mm256_store_ps(line_buffer, one);
 851 | 					_mm256_store_ps(line_buffer + 8, pix2f);
 852 | 
 853 | 					line_buffer += 16;
 854 | 				}
 855 | 			}
 856 | 		}
 857 | 
 858 | 		// process other lines
 859 | 		for (int y = height - 2; y >= 0; y--)
 860 | 		{
 861 | 			float* line_buffer = line_cache;
 862 | 			__m256i* dst_buf = (__m256i*)(img_dst + y * pitch);
 863 | 			__m256i* src_8xCur = (__m256i*)(img_src + y * pitch);
 864 | 			__m256i* src_8xPrev = (__m256i*)(img_src + (y + 1) * pitch);
 865 | 
 866 | 			for (int x = 0; x < width8; x++)
 867 | 			{
 868 | 				__m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel
 869 | 				__m256i pix8p = _mm256_load_si256(src_8xPrev++);
 870 | 				__m256i pix_out; // final 8x packed pixels
 871 | 
 872 | 				// get color differences
 873 | 				getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff);
 874 | 
 875 | 				////////////////////
 876 | 				// pixel 1, 5 unpack
 877 | 				{
 878 | 					// alpha factor
 879 | 					float alpha2_f = range_table[color_diff[0]];
 880 | 					float alpha1_f = range_table[color_diff[4]];
 881 | 					__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 882 | 						alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 883 | 					
 884 | 					// load previous line color factor
 885 | 					__m256 alpha_prev = _mm256_load_ps(line_buffer);
 886 | 					// load previous line color
 887 | 					__m256 color_prev = _mm256_load_ps(line_buffer + 8);
 888 | 					
 889 | 					// unpack current source pixel
 890 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
 891 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 892 | 
 893 | 					// filter
 894 | 					pix2f = _mm256_mul_ps(pix2f, inv_alpha);
 895 | 					alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 896 | 					color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 897 | 					
 898 | 					// store current factor and color as previous for next cycle
 899 | 					_mm256_store_ps(line_buffer, alpha_prev);
 900 | 					_mm256_store_ps(line_buffer + 8, color_prev);
 901 | 					line_buffer += 16;
 902 | 
 903 | 					// calculate final color
 904 | 					pix2f = _mm256_div_ps(color_prev, alpha_prev);
 905 | 
 906 | 					// pack float pixel into byte pixel
 907 | 					pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer
 908 | 					pix_out = _mm256_shuffle_epi8(pix2i, mask_pack);
 909 | 				}
 910 | 				
 911 | 				// loop for other pixels
 912 | 				for(int i=1; i<4; i++)
 913 | 				{
 914 | 					// alpha factor
 915 | 					float alpha2_f = range_table[color_diff[i]];
 916 | 					float alpha1_f = range_table[color_diff[i+4]];
 917 | 					__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
 918 | 						alpha2_f, alpha2_f, alpha2_f, alpha2_f);
 919 | 
 920 | 					// load previous line color factor
 921 | 					__m256 alpha_prev = _mm256_load_ps(line_buffer);
 922 | 					// load previous line color
 923 | 					__m256 color_prev = _mm256_load_ps(line_buffer + 8);
 924 | 
 925 | 					// unpack current source pixel
 926 | 					pix8 = _mm256_srli_si256(pix8, 4); // shift right
 927 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
 928 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 929 | 
 930 | 					// filter
 931 | 					pix2f = _mm256_mul_ps(pix2f, inv_alpha);
 932 | 					alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
 933 | 					color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
 934 | 
 935 | 					// store current factor and color as previous for next cycle
 936 | 					_mm256_store_ps(line_buffer, alpha_prev);
 937 | 					_mm256_store_ps(line_buffer + 8, color_prev);
 938 | 					line_buffer += 16;
 939 | 
 940 | 					// calculate final color
 941 | 					pix2f = _mm256_div_ps(color_prev, alpha_prev);
 942 | 
 943 | 					// pack float pixel into byte pixel
 944 | 					pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer
 945 | 					pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
 946 | 					pix_out = _mm256_srli_si256(pix_out, 4); // shift 
 947 | 					pix_out = _mm256_or_si256(pix_out, pix2i); // combine
 948 | 				}
 949 | 				
 950 | 				// store result
 951 | 				_mm256_store_si256(dst_buf++, pix_out);
 952 | 			}
 953 | 		}
 954 | 	}
 955 | 
 956 | 	/////////////////
 957 | 	// Top to bottom pass last
 958 | 	{
 959 | 
 960 | 		// first line processed separately since no previous
 961 | 		{
 962 | 			float* line_buffer = line_cache;
 963 | 			__m256i* dst_line = (__m256i*)img_dst;
 964 | 			__m256i* src_8xCur = (__m256i*)img_src;
 965 | 
 966 | 			__m256 one = _mm256_set1_ps(1.f);
 967 | 
 968 | 			for (int x = 0; x < width8; x++)
 969 | 			{
 970 | 				__m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel
 971 | 				__m256i pix8_d = _mm256_load_si256(dst_line);
 972 | 				pix8_d = _mm256_avg_epu8(pix8_d, pix8); // average out
 973 | 				_mm256_store_si256(dst_line++, pix8_d);
 974 | 
 975 | 				for (int i = 0; i < 4; i++)
 976 | 				{
 977 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
 978 | 					pix8 = _mm256_srli_si256(pix8, 4); // shift right
 979 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
 980 | 
 981 | 					_mm256_store_ps(line_buffer, one);
 982 | 					_mm256_store_ps(line_buffer + 8, pix2f);
 983 | 
 984 | 					line_buffer += 16;
 985 | 				}
 986 | 			}
 987 | 		}
 988 | 
 989 | 		// process other lines
 990 | 		for (int y = 1; y < height; y++)
 991 | 		{
 992 | 			float* line_buffer = line_cache;
 993 | 			__m256i* dst_buf = (__m256i*)(img_dst + y * pitch);
 994 | 			__m256i* src_8xCur = (__m256i*)(img_src + y * pitch);
 995 | 			__m256i* src_8xPrev = (__m256i*)(img_src + (y - 1) * pitch);
 996 | 
 997 | 			for (int x = 0; x < width8; x++)
 998 | 			{
 999 | 				__m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel
1000 | 				__m256i pix8p = _mm256_load_si256(src_8xPrev++);
1001 | 				__m256i pix_out; // final 8x packed pixels
1002 | 
1003 | 				// get color differences
1004 | 				getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff);
1005 | 
1006 | 				////////////////////
1007 | 				// pixel 1, 5 unpack
1008 | 				{
1009 | 					// alpha factor
1010 | 					float alpha2_f = range_table[color_diff[0]];
1011 | 					float alpha1_f = range_table[color_diff[4]];
1012 | 					__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
1013 | 						alpha2_f, alpha2_f, alpha2_f, alpha2_f);
1014 | 
1015 | 					// load previous line color factor
1016 | 					__m256 alpha_prev = _mm256_load_ps(line_buffer);
1017 | 					// load previous line color
1018 | 					__m256 color_prev = _mm256_load_ps(line_buffer + 8);
1019 | 
1020 | 					// unpack current source pixel
1021 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
1022 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
1023 | 
1024 | 					// filter
1025 | 					pix2f = _mm256_mul_ps(pix2f, inv_alpha);
1026 | 					alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
1027 | 					color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
1028 | 
1029 | 					// store current factor and color as previous for next cycle
1030 | 					_mm256_store_ps(line_buffer, alpha_prev);
1031 | 					_mm256_store_ps(line_buffer + 8, color_prev);
1032 | 					line_buffer += 16;
1033 | 
1034 | 					// calculate final color
1035 | 					pix2f = _mm256_div_ps(color_prev, alpha_prev);
1036 | 
1037 | 					// pack float pixel into byte pixel
1038 | 					pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer
1039 | 					pix_out = _mm256_shuffle_epi8(pix2i, mask_pack);
1040 | 				}
1041 | 
1042 | 				// loop for other pixels
1043 | 				for (int i = 1; i<4; i++)
1044 | 				{
1045 | 					// alpha factor
1046 | 					float alpha2_f = range_table[color_diff[i]];
1047 | 					float alpha1_f = range_table[color_diff[i + 4]];
1048 | 					__m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f,
1049 | 						alpha2_f, alpha2_f, alpha2_f, alpha2_f);
1050 | 
1051 | 					// load previous line color factor
1052 | 					__m256 alpha_prev = _mm256_load_ps(line_buffer);
1053 | 					// load previous line color
1054 | 					__m256 color_prev = _mm256_load_ps(line_buffer + 8);
1055 | 
1056 | 					// unpack current source pixel
1057 | 					pix8 = _mm256_srli_si256(pix8, 4); // shift right
1058 | 					__m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack);
1059 | 					__m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats
1060 | 
1061 | 					// filter
1062 | 					pix2f = _mm256_mul_ps(pix2f, inv_alpha);
1063 | 					alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor
1064 | 					color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color
1065 | 
1066 | 					// store current factor and color as previous for next cycle
1067 | 					_mm256_store_ps(line_buffer, alpha_prev);
1068 | 					_mm256_store_ps(line_buffer + 8, color_prev);
1069 | 					line_buffer += 16;
1070 | 
1071 | 					// calculate final color
1072 | 					pix2f = _mm256_div_ps(color_prev, alpha_prev);
1073 | 
1074 | 					// pack float pixel into byte pixel
1075 | 					pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer
1076 | 					pix2i = _mm256_shuffle_epi8(pix2i, mask_pack);
1077 | 					pix_out = _mm256_srli_si256(pix_out, 4); // shift 
1078 | 					pix_out = _mm256_or_si256(pix_out, pix2i); // combine
1079 | 				}
1080 | 
1081 | 				// average result with previous values in destination buffer
1082 | 				__m256i pix8_d = _mm256_load_si256(dst_buf);
1083 | 				pix_out = _mm256_avg_epu8(pix8_d, pix_out);
1084 | 				_mm256_store_si256(dst_buf++, pix_out);
1085 | 			}
1086 | 		}
1087 | 	}
1088 | }
1089 | 
1090 | bool CRBFilterAVX2::filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch)
1091 | {
1092 | 	// basic error checking
1093 | 	if (!m_stage_buffer[0])
1094 | 		return false;
1095 | 
1096 | 	if (width < 32 || width > m_reserved_width)
1097 | 		return false;
1098 | 
1099 | 	if (height < 16 || height > m_reserved_height)
1100 | 		return false;
1101 | 
1102 | 	if (pitch < width * 4)
1103 | 		return false;
1104 | 
1105 | 	if (!out_data || !in_data)
1106 | 		return false;
1107 | 
1108 | 	if (m_inv_alpha_f == 0.f)
1109 | 		return false;
1110 | 
1111 | 	int thread_count_adjusted = m_thread_count - 1;
1112 | 
1113 | 	//////////////////////////////////////////////
1114 | 	// horizontal filter divided in threads
1115 | 	for (int i = 0; i < thread_count_adjusted; i++)
1116 | 	{
1117 | 		m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::horizontalFilter, this, i, in_data, m_stage_buffer[0], width, height, pitch);
1118 | 	}
1119 | 
1120 | 	// use this thread for last segment
1121 | 	horizontalFilter(thread_count_adjusted, in_data, m_stage_buffer[0], width, height, pitch);
1122 | 
1123 | 	// wait for result
1124 | 	for (int i = 0; i < thread_count_adjusted; i++)
1125 | 	{
1126 | 		m_horizontal_tasks[i].get();
1127 | 	}
1128 | 
1129 | 	///////////////////////////////////////////// 
1130 | 	// vertical filter divided in threads
1131 | 	for (int i = 0; i < thread_count_adjusted; i++)
1132 | 	{
1133 | 		m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::verticalFilter, this, i, m_stage_buffer[0], out_data, width, height, pitch);
1134 | 	}
1135 | 
1136 | 	// use this thread for last segment
1137 | 	verticalFilter(thread_count_adjusted, m_stage_buffer[0], out_data, width, height, pitch);
1138 | 
1139 | 	// wait for result
1140 | 	for (int i = 0; i < thread_count_adjusted; i++)
1141 | 	{
1142 | 		m_vertical_tasks[i].get();
1143 | 	}
1144 | 
1145 | 	return true;
1146 | }
1147 | 
1148 | bool CRBFilterAVX2::filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch)
1149 | {
1150 | 	// basic error checking
1151 | 	if (!m_stage_buffer[0])
1152 | 		return false;
1153 | 
1154 | 	if (width < 16 || width > m_reserved_width)
1155 | 		return false;
1156 | 
1157 | 	if (height < 16 || height > m_reserved_height)
1158 | 		return false;
1159 | 
1160 | 	if (pitch < width * 4)
1161 | 		return false;
1162 | 
1163 | 	if (m_inv_alpha_f == 0.f)
1164 | 		return false;
1165 | 
1166 | 	m_image_width = width;
1167 | 	m_image_height = height;
1168 | 	m_image_pitch = pitch;
1169 | 
1170 | 	// block until last frame finished 1st stage
1171 | 	for (int i = 0; i < m_thread_count; i++)
1172 | 	{
1173 | 		if(m_horizontal_tasks[i].valid())
1174 | 			m_horizontal_tasks[i].get();
1175 | 	}
1176 | 
1177 | 	int previous_stage_index = (m_filter_counter - 1) % STAGE_BUFFER_COUNT;
1178 | 	int current_stage_index = m_filter_counter % STAGE_BUFFER_COUNT;
1179 | 	m_filter_counter++;
1180 | 	m_out_buffer[current_stage_index] = out_data;
1181 | 
1182 | 	// start new horizontal stage
1183 | 	if (in_data)
1184 | 	{
1185 | 		// start first stage for current frame
1186 | 		for (int i = 0; i < m_thread_count; i++)
1187 | 		{
1188 | 			m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::horizontalFilter, this, i, in_data, m_stage_buffer[current_stage_index], width, height, pitch);
1189 | 		}
1190 | 	}
1191 | 
1192 | 	// block until last frame finished 2nd stage
1193 | 	for (int i = 0; i < m_thread_count; i++)
1194 | 	{
1195 | 		if (m_vertical_tasks[i].valid())
1196 | 			m_vertical_tasks[i].get();
1197 | 	}
1198 | 
1199 | 	// start new vertical stage based on result of previous stage
1200 | 	if (previous_stage_index >= 0 && m_out_buffer[previous_stage_index])
1201 | 	{
1202 | 		// start first stage for current frame
1203 | 		for (int i = 0; i < m_thread_count; i++)
1204 | 		{
1205 | 			m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::verticalFilter, this, i, m_stage_buffer[previous_stage_index], m_out_buffer[previous_stage_index], width, height, pitch);
1206 | 		}
1207 | 	}
1208 | 	
1209 | 	return true;
1210 | }
1211 | 
1212 | void CRBFilterAVX2::filterPipeFlush()
1213 | {
1214 | 	filterPipePush(nullptr, nullptr, m_image_width, m_image_height, m_image_pitch);
1215 | 
1216 | 	if (m_filter_counter > 0)
1217 | 	{
1218 | 		for (int i = 0; i < m_thread_count; i++)
1219 | 		{
1220 | 			if(m_vertical_tasks[i].valid())
1221 | 				m_vertical_tasks[i].get();
1222 | 		}
1223 | 	}
1224 | 
1225 | 	m_filter_counter = 0;
1226 | }


--------------------------------------------------------------------------------