├── .gitignore
├── media
    ├── AVX2.gif
    ├── Cross.gif
    ├── glsl.png
    ├── AVX512.gif
    ├── Serial.gif
    ├── SSE-NEON.gif
    ├── qTriangle.aep
    ├── CrossMethod.gif
    └── BarycentricMethod.gif
├── .gitmodules
├── include
    └── qTriangle
    │   ├── Util.hpp
    │   ├── qTriangle.hpp
    │   └── Types.hpp
├── source
    └── qTriangle
    │   ├── Util.cpp
    │   ├── qTriangle.cpp
    │   └── qTriangle-x86.hpp
├── test
    ├── Bench.hpp
    ├── Display.cpp
    ├── Benchmark.cpp
    ├── FillShape.cpp
    └── stb_image_write.h
├── LICENSE
├── CMakeLists.txt
├── scripts
    └── GenGifs.py
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .*
3 | !/.gitignore
4 | 


--------------------------------------------------------------------------------
/media/AVX2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/AVX2.gif


--------------------------------------------------------------------------------
/media/Cross.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/Cross.gif


--------------------------------------------------------------------------------
/media/glsl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/glsl.png


--------------------------------------------------------------------------------
/media/AVX512.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/AVX512.gif


--------------------------------------------------------------------------------
/media/Serial.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/Serial.gif


--------------------------------------------------------------------------------
/media/SSE-NEON.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/SSE-NEON.gif


--------------------------------------------------------------------------------
/media/qTriangle.aep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/qTriangle.aep


--------------------------------------------------------------------------------
/media/CrossMethod.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/CrossMethod.gif


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "extern/glm"]
2 | 	path = extern/glm
3 | 	url = git@github.com:g-truc/glm.git
4 | 


--------------------------------------------------------------------------------
/media/BarycentricMethod.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wunkolo/qTriangle/HEAD/media/BarycentricMethod.gif


--------------------------------------------------------------------------------
/include/qTriangle/Util.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // Forward declaration; the full class is defined in Types.hpp.
 4 | namespace qTri { class Image; }
 5 | 
 6 | namespace qTri::Util
 7 | {
 8 | // Prints Frame to stdout, one text row per image row: '@' where a pixel's
 9 | // low bit is set and ' ' elsewhere, framed by colored '|' borders.
10 | void Draw(const Image& Frame);
11 | }
12 | 


--------------------------------------------------------------------------------
/include/qTriangle/qTriangle.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstddef>
 3 | #include <cstdint>
 4 | #include <algorithm>
 5 | #include <vector>
 6 | #include <tuple>
 7 | 
 8 | #include "Types.hpp"
 9 | #include "Util.hpp"
10 | 
11 | namespace qTri
12 | {
13 | extern const std::vector< // Every available fill routine; defined in qTriangle.cpp
14 | 	std::pair<
15 | 		void(* const)( // .first: tests Count entries of Points against Tri, writing to Results
16 | 			const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
17 | 			const Triangle& Tri
18 | 		),
19 | 		const char* // .second: display name of the routine
20 | 	>
21 | > FillAlgorithms;
22 | }
23 | 


--------------------------------------------------------------------------------
/include/qTriangle/Types.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <tuple>
 3 | #include <vector>
 4 | #include <array>
 5 | 
 6 | #include <glm/fwd.hpp>
 7 | #include <glm/vec2.hpp>
 8 | 
 9 | namespace qTri
10 | {
11 | // 8-bit image; Util::Draw reads pixel (x, y) at Pixels[x + y * Width].
12 | class Image
13 | {
14 | public:
15 | 	// Allocates Width * Height zero-initialized pixels.
16 | 	Image(std::size_t Width, std::size_t Height)
17 | 		: Width(Width), Height(Height), Pixels(Width * Height)
18 | 	{}
19 | 
20 | 	std::size_t Width;
21 | 	std::size_t Height;
22 | 	std::vector<std::uint8_t> Pixels;
23 | };
24 | 
25 | // Three integer 2D vertices.
26 | using Triangle = std::array<glm::i32vec2,3>;
27 | }
28 | 


--------------------------------------------------------------------------------
/source/qTriangle/Util.cpp:
--------------------------------------------------------------------------------
 1 | #include <qTriangle/Util.hpp>
 2 | #include <cstdio>
 3 | #include <qTriangle/Types.hpp>
 4 | namespace qTri::Util
 5 | {
 6 | // Prints Frame to stdout, one text row per image row: '@' where a pixel's
 7 | // low bit is set, ' ' elsewhere, between colored '|' borders. The terminal
 8 | // attributes are reset once the last row has been written.
 9 | void Draw(const qTri::Image& Frame)
10 | {
11 | 	for( std::size_t Row = 0; Row < Frame.Height; ++Row )
12 | 	{
13 | 		const std::size_t RowBase = Row * Frame.Width;
14 | 		std::fputs("\033[0;35m|\033[1;36m", stdout);
15 | 		for( std::size_t Col = 0; Col < Frame.Width; ++Col )
16 | 		{
17 | 			const bool Lit = (Frame.Pixels[RowBase + Col] & 1) != 0;
18 | 			std::putchar(Lit ? '@' : ' ');
19 | 		}
20 | 		std::fputs("\033[0;35m|\n", stdout);
21 | 	}
22 | 	std::fputs("\033[0m", stdout);
23 | }
24 | }
25 | 


--------------------------------------------------------------------------------
/test/Bench.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <chrono>
 3 | 
 4 | // Measures the time it takes to execute a callable once.
 5 | // TimeT is the std::chrono duration type the elapsed time is reported in.
 6 | template< typename TimeT = std::chrono::nanoseconds >
 7 | struct Bench
 8 | {
 9 | 	// Invokes Func(Arguments...), perfect-forwarding everything, and returns
10 | 	// the elapsed time converted to TimeT. Func's return value is discarded.
11 | 	template< typename FunctionT, typename ...ArgsT >
12 | 	static TimeT Duration(FunctionT&& Func, ArgsT&&... Arguments)
13 | 	{
14 | 		// steady_clock is monotonic; high_resolution_clock may alias the
15 | 		// adjustable system clock and yield negative or skewed intervals.
16 | 		using Clock = std::chrono::steady_clock;
17 | 		const Clock::time_point Start = Clock::now();
18 | 		std::forward<FunctionT>(Func)(std::forward<ArgsT>(Arguments)...);
19 | 		return std::chrono::duration_cast<TimeT>(Clock::now() - Start);
20 | 	}
21 | };
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Wunkolo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required( VERSION 3.2.2 )
  2 | project( qTriangle CXX )
  3 | 
  4 | ### Standard
  5 | set( CMAKE_CXX_STANDARD 17 )
  6 | set( CMAKE_CXX_STANDARD_REQUIRED ON )
  7 | set( CMAKE_CXX_EXTENSIONS ON )
  8 | 
  9 | ### Verbosity
 10 | set( CMAKE_COLOR_MAKEFILE ON )
 11 | set( CMAKE_VERBOSE_MAKEFILE ON )
 12 | 
 13 | # Generate 'compile_commands.json' for clang_complete
 14 | set( CMAKE_EXPORT_COMPILE_COMMANDS ON )
 15 | 
 16 | ### Global includes
 17 | include_directories(
 18 | 	include
 19 | )
 20 | 
 21 | ### Optimizations
 22 | if( MSVC )
 23 | 	add_compile_options( /arch:AVX2 )
 24 | 	add_compile_options( /W3 )
 25 | 	add_compile_options( /Gv )
 26 | elseif( CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
 27 | 	add_compile_options( -march=native )
 28 | 	add_compile_options( -Ofast )
 29 | 	add_compile_options( -Wall )
 30 | 	add_compile_options( -Wextra )
 31 | 	# Force colored diagnostic messages in Ninja's output
 32 | 	if( CMAKE_GENERATOR STREQUAL "Ninja" )
 33 | 	    add_compile_options( -fdiagnostics-color=always )
 34 | 	endif()
 35 | endif()
 36 | 
 37 | ## GLM
 38 | set( GLM_TEST_ENABLE OFF CACHE BOOL "Build GLM Unit Tests")
 39 | add_subdirectory( extern/glm )
 40 | 
 41 | ### Target
 42 | add_library(
 43 | 	qTriangle
 44 | 	STATIC
 45 | 	source/qTriangle/qTriangle.cpp
 46 | 	source/qTriangle/Util.cpp
 47 | )
 48 | target_link_libraries(
 49 | 	qTriangle
 50 | 	PRIVATE
 51 | 	glm
 52 | )
 53 | 
 54 | ### Tests
 55 | enable_testing()
 56 | 
 57 | ## Display
 58 | add_executable(
 59 | 	Display
 60 | 	test/Display.cpp
 61 | )
 62 | target_link_libraries(
 63 | 	Display
 64 | 	PRIVATE
 65 | 	qTriangle
 66 | 	glm
 67 | )
 68 | add_test(
 69 | 	NAME Display
 70 | 	COMMAND Display
 71 | )
 72 | 
 73 | ## Benchmark
 74 | add_executable(
 75 | 	Benchmark
 76 | 	test/Benchmark.cpp
 77 | )
 78 | target_link_libraries(
 79 | 	Benchmark
 80 | 	PRIVATE
 81 | 	qTriangle
 82 | 	glm
 83 | )
 84 | add_test(
 85 | 	NAME Benchmark
 86 | 	COMMAND Benchmark
 87 | )
 88 | 
 89 | ## FillShape
 90 | add_executable(
 91 | 	FillShape
 92 | 	test/FillShape.cpp
 93 | )
 94 | target_link_libraries(
 95 | 	FillShape
 96 | 	PRIVATE
 97 | 	qTriangle
 98 | 	glm
 99 | )
100 | add_test(
101 | 	NAME FillShape
102 | 	COMMAND FillShape
103 | )
104 | # Link filesystem libs for GCC/Clang
105 | if( CMAKE_COMPILER_IS_GNUCXX )
106 | 	target_link_libraries(
107 | 		FillShape
108 | 		PRIVATE
109 | 		stdc++fs
110 | 	)
111 | endif()
112 | 


--------------------------------------------------------------------------------
/source/qTriangle/qTriangle.cpp:
--------------------------------------------------------------------------------
  1 | #include <qTriangle/qTriangle.hpp>
  2 | 
  3 | #define GLM_ENABLE_EXPERIMENTAL
  4 | #include <glm/glm.hpp>
  5 | #include <glm/gtx/component_wise.hpp>
  6 | 
  7 | namespace qTri
  8 | {
  9 | 
 10 | // Z component of the cross product of two 2D direction vectors, i.e. the
 11 | // determinant of the 2x2 matrix whose rows are Top and Bottom.
 12 | inline std::int32_t Det(const glm::i32vec2& Top, const glm::i32vec2& Bottom)
 13 | {
 14 | 	const std::int32_t Diagonal     = Top.x * Bottom.y;
 15 | 	const std::int32_t AntiDiagonal = Top.y * Bottom.x;
 16 | 	return Diagonal - AntiDiagonal;
 17 | }
 18 | 
 19 | //// Cross Product Method
 20 | 
 21 | // Generic fallback: a width without its own specialization defers to the
 22 | // next narrower one.
 23 | template<std::uint8_t WidthExp2>
 24 | inline void CrossProductMethod(
 25 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 26 | 	const Triangle& Tri
 27 | )
 28 | {
 29 | 	constexpr std::uint8_t NarrowerExp2 = WidthExp2 - 1;
 30 | 	CrossProductMethod<NarrowerExp2>(Points, Results, Count, Tri);
 31 | }
 32 | 
 33 | //// Barycentric Method
 34 | 
 35 | // Generic fallback: a width without its own specialization defers to the
 36 | // next narrower one.
 37 | template<std::uint8_t WidthExp2>
 38 | inline void BarycentricMethod(
 39 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 40 | 	const Triangle& Tri
 41 | )
 42 | {
 43 | 	constexpr std::uint8_t NarrowerExp2 = WidthExp2 - 1;
 44 | 	BarycentricMethod<NarrowerExp2>(Points, Results, Count, Tri);
 45 | }
 46 | 
 47 | #if defined(__x86_64__) || defined(_M_X64)
 48 | #include "qTriangle-x86.hpp"
 49 | #else
 50 | // Serial cross-product test. ORs 1 into Results[i] when Points[i] lies on
 51 | // the non-negative side of all three directed edges Tri[0]->Tri[1],
 52 | // Tri[1]->Tri[2] and Tri[2]->Tri[0] (points exactly on an edge count as
 53 | // inside); bits already set in Results are left untouched.
 54 | template<>
 55 | inline void CrossProductMethod<0>(
 56 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 57 | 	const Triangle& Tri
 58 | )
 59 | {
 60 | 	// Directional vectors along all three triangle edges
 61 | 	const glm::i32vec2 EdgeDir[3] = {
 62 | 		Tri[1] - Tri[0],
 63 | 		Tri[2] - Tri[1],
 64 | 		Tri[0] - Tri[2]
 65 | 	};
 66 | 
 67 | 	for( std::size_t i = 0; i < Count; ++i )
 68 | 	{
 69 | 		// Stay in 32-bit integers: routing the determinants through a float
 70 | 		// glm::vec3 rounds magnitudes above 2^24 and costs two conversions.
 71 | 		const glm::i32vec3 Crosses(
 72 | 			Det( EdgeDir[0], Points[i] - Tri[0] ),
 73 | 			Det( EdgeDir[1], Points[i] - Tri[1] ),
 74 | 			Det( EdgeDir[2], Points[i] - Tri[2] )
 75 | 		);
 76 | 
 77 | 		Results[i] |= glm::all(
 78 | 			glm::greaterThanEqual(
 79 | 				Crosses,
 80 | 				glm::i32vec3(0)
 81 | 			)
 82 | 		);
 83 | 	}
 84 | }
 85 | 
 86 | // Serial barycentric test: ORs 1 into Results[i] when the two unnormalized
 87 | // weights U and V of Points[i] are both non-negative and sum to less than
 88 | // Area, the shoelace sum over the three vertices.
 89 | template<>
 90 | inline void BarycentricMethod<0>(
 91 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 92 | 	const Triangle& Tri
 93 | )
 94 | {
 95 | 	// Point-independent terms, computed once up front
 96 | 	const std::int32_t Det01 = Det( Tri[0], Tri[1] );
 97 | 	const std::int32_t Det20 = Det( Tri[2], Tri[0] );
 98 | 	const std::int32_t Area  = Det( Tri[1], Tri[2] ) + Det20 + Det01;
 99 | 	for( std::size_t Idx = 0; Idx != Count; ++Idx )
100 | 	{
101 | 		const glm::i32vec2& Point = Points[Idx];
102 | 		const std::int32_t U = Det20 + Det( Tri[0], Point ) + Det( Point, Tri[2] );
103 | 		const std::int32_t V = Det01 + Det( Tri[1], Point ) + Det( Point, Tri[0] );
104 | 		const bool Inside = U >= 0 && V >= 0 && (U + V) < Area;
105 | 		Results[Idx] |= Inside;
106 | 	}
107 | }
108 | #endif
109 | 
110 | //// Exports
111 | 
112 | // Fill routines and their display names. 0xFF (not -1, which narrows to
113 | // std::uint8_t and is ill-formed) selects the widest available variant.
114 | const std::vector<
115 | 	std::pair<
116 | 		void(* const)(
117 | 			const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
118 | 			const Triangle& Tri
119 | 		),
120 | 		const char*
121 | 	>
122 | > FillAlgorithms = {
123 | 	{CrossProductMethod<0x00>,	"Serial-CrossProduct"},
124 | 	{CrossProductMethod<0xFF>,	"CrossProductMethod"},
125 | 	{BarycentricMethod<0x00>,	"Serial-Barycentric"},
126 | 	{BarycentricMethod<0xFF>,	"BarycentricMethod"},
127 | };
128 | }
129 | 
130 | 


--------------------------------------------------------------------------------
/test/Display.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstddef>
  2 | #include <cstdint>
  3 | #include <cstdlib>
  4 | #include <type_traits>
  5 | #include <algorithm>
  6 | #include <random>
  7 | 
  8 | #include <glm/glm.hpp>
  9 | #include <glm/gtc/constants.hpp>
 10 | #include <glm/trigonometric.hpp>
 11 | 
 12 | #include <qTriangle/qTriangle.hpp>
 13 | 
 14 | #include "Bench.hpp"
 15 | 
 16 | #ifdef _WIN32
 17 | #define NOMINMAX
 18 | #include <Windows.h>
 19 | // Enables "ENABLE_VIRTUAL_TERMINAL_PROCESSING" on stdout during static
 20 | // initialization so unix-style escape sequences render on Windows consoles.
 21 | // Holds false when stdout is not a console (e.g. redirected to a file) or
 22 | // when the mode could not be applied.
 23 | static const bool _WndV100Enabled = []() -> bool
 24 | 	{
 25 | 		const auto Handle = GetStdHandle(STD_OUTPUT_HANDLE);
 26 | 		// Zero-initialized and checked: a failed GetConsoleMode may leave
 27 | 		// ConsoleMode unwritten, which previously meant reading garbage.
 28 | 		DWORD ConsoleMode = 0;
 29 | 		if( !GetConsoleMode(Handle, &ConsoleMode) )
 30 | 		{
 31 | 			return false;
 32 | 		}
 33 | 		SetConsoleMode(
 34 | 			Handle, ConsoleMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING
 35 | 		);
 36 | 		return GetConsoleMode(Handle, &ConsoleMode)
 37 | 			&& (ConsoleMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) != 0;
 38 | 	}();
 39 | #endif
 40 | 
 41 | constexpr std::size_t Width = 80;
 42 | constexpr std::size_t Height = 50;
 43 | 
 44 | static qTri::Triangle Triangles[12];
 45 | 
 46 | // Rasterizes the same random triangles with every registered fill algorithm,
 47 | // printing each algorithm's average time per triangle and the resulting image.
 48 | int main()
 49 | {
 50 | 	// Generate random triangles
 51 | 	std::random_device RandomDevice;
 52 | 	std::mt19937 RandomEngine(RandomDevice());
 53 | 	std::uniform_int_distribution<std::int32_t> WidthDis(0, Width);
 54 | 	std::uniform_int_distribution<std::int32_t> HeightDis(0, Height);
 55 | 	for( qTri::Triangle& CurTriangle : Triangles )
 56 | 	{
 57 | 		glm::i32vec2 Center{};
 58 | 		// Randomly place vertices
 59 | 		for( glm::i32vec2& CurVert : CurTriangle )
 60 | 		{
 61 | 			CurVert.x = WidthDis(RandomEngine);
 62 | 			CurVert.y = HeightDis(RandomEngine);
 63 | 			Center += CurVert;
 64 | 		}
 65 | 		Center /= 3;
 66 | 		// Sort points by their angle around the center so that every triangle
 67 | 		// ends up with the same winding
 68 | 		std::sort(
 69 | 			std::begin(CurTriangle),
 70 | 			std::end(CurTriangle),
 71 | 			[&Center](const glm::i32vec2& A, const glm::i32vec2& B) -> bool
 72 | 				{
 73 | 					const glm::i32vec2 DirectionA = Center - A;
 74 | 					const glm::i32vec2 DirectionB = Center - B;
 75 | 					const auto AngleA = glm::atan<glm::float32_t>(DirectionA.y, DirectionA.x);
 76 | 					const auto AngleB = glm::atan<glm::float32_t>(DirectionB.y, DirectionB.x);
 77 | 					return AngleA < AngleB;
 78 | 				}
 79 | 		);
 80 | 	}
 81 | 
 82 | 	// Generate 2d grid of points to test against, row by row. The final size
 83 | 	// is known up front, so reserve once instead of regrowing the vector.
 84 | 	std::vector<glm::i32vec2> FragCoords;
 85 | 	FragCoords.reserve(Width * Height);
 86 | 	for( std::size_t y = 0; y < Height; ++y )
 87 | 	{
 88 | 		for( std::size_t x = 0; x < Width; ++x )
 89 | 		{
 90 | 			FragCoords.emplace_back(x,y);
 91 | 		}
 92 | 	}
 93 | 
 94 | 	for( const auto& FillAlgorithm : qTri::FillAlgorithms )
 95 | 	{
 96 | 		std::printf("%s - ", FillAlgorithm.second);
 97 | 		// All triangles are accumulated into the same frame
 98 | 		qTri::Image CurFrame(Width, Height);
 99 | 		std::size_t ExecTime = 0;
100 | 		for( const qTri::Triangle& CurTriangle : Triangles )
101 | 		{
102 | 			ExecTime += Bench<>::Duration(
103 | 				FillAlgorithm.first,
104 | 				FragCoords.data(),
105 | 				CurFrame.Pixels.data(),
106 | 				FragCoords.size(),
107 | 				CurTriangle
108 | 			).count();
109 | 		}
110 | 		// Average over all triangles
111 | 		ExecTime /= std::extent<decltype(Triangles)>::value;
112 | 		std::printf("%zu ns\n", ExecTime);
113 | 		qTri::Util::Draw(CurFrame);
114 | 	}
115 | 
116 | 	return EXIT_SUCCESS;
117 | }
118 | 


--------------------------------------------------------------------------------
/test/Benchmark.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstddef>
  2 | #include <cstdint>
  3 | #include <cstdlib>
  4 | #include <type_traits>
  5 | #include <algorithm>
  6 | #include <random>
  7 | 
  8 | #include <glm/glm.hpp>
  9 | 
 10 | #include <qTriangle/qTriangle.hpp>
 11 | 
 12 | #include "Bench.hpp"
 13 | 
 14 | #ifdef _WIN32
 15 | #define NOMINMAX
 16 | #include <Windows.h>
 17 | // Enables "ENABLE_VIRTUAL_TERMINAL_PROCESSING" on stdout during static
 18 | // initialization so unix-style escape sequences render on Windows consoles.
 19 | // Holds false when stdout is not a console (e.g. redirected to a file) or
 20 | // when the mode could not be applied.
 21 | static const bool _WndV100Enabled = []() -> bool
 22 | 	{
 23 | 		const auto Handle = GetStdHandle(STD_OUTPUT_HANDLE);
 24 | 		// Zero-initialized and checked: a failed GetConsoleMode may leave
 25 | 		// ConsoleMode unwritten, which previously meant reading garbage.
 26 | 		DWORD ConsoleMode = 0;
 27 | 		if( !GetConsoleMode(Handle, &ConsoleMode) )
 28 | 		{
 29 | 			return false;
 30 | 		}
 31 | 		SetConsoleMode(
 32 | 			Handle, ConsoleMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING
 33 | 		);
 34 | 		return GetConsoleMode(Handle, &ConsoleMode)
 35 | 			&& (ConsoleMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) != 0;
 36 | 	}();
 37 | #endif
 38 | 
 39 | constexpr std::size_t Width = 80;
 40 | constexpr std::size_t Height = 50;
 41 | constexpr std::size_t Loops = 5;
 42 | static qTri::Triangle Triangles[100'000];
 43 | 
 44 | // Times every registered fill algorithm over the same set of random triangles
 45 | // and prints, per algorithm, the average time spent on a single triangle.
 46 | int main()
 47 | {
 48 | 	// Generate random triangles
 49 | 	std::random_device RandomDevice;
 50 | 	std::mt19937 RandomEngine(RandomDevice());
 51 | 	std::uniform_int_distribution<std::int32_t> WidthDis(0, Width);
 52 | 	std::uniform_int_distribution<std::int32_t> HeightDis(0, Height);
 53 | 	for( qTri::Triangle& CurTriangle : Triangles )
 54 | 	{
 55 | 		glm::i32vec2 Center{};
 56 | 		// Randomly place vertices
 57 | 		for( glm::i32vec2& CurVert : CurTriangle )
 58 | 		{
 59 | 			CurVert.x = WidthDis(RandomEngine);
 60 | 			CurVert.y = HeightDis(RandomEngine);
 61 | 			Center += CurVert;
 62 | 		}
 63 | 		Center /= 3;
 64 | 		// Sort points by their angle around the center so that every triangle
 65 | 		// ends up with the same winding
 66 | 		std::sort(
 67 | 			std::begin(CurTriangle),
 68 | 			std::end(CurTriangle),
 69 | 			[&Center](const glm::i32vec2& A, const glm::i32vec2& B) -> bool
 70 | 				{
 71 | 					const glm::i32vec2 DirectionA = Center - A;
 72 | 					const glm::i32vec2 DirectionB = Center - B;
 73 | 					const auto AngleA = glm::atan<glm::float32_t>(DirectionA.y, DirectionA.x);
 74 | 					const auto AngleB = glm::atan<glm::float32_t>(DirectionB.y, DirectionB.x);
 75 | 					return AngleA < AngleB;
 76 | 				}
 77 | 		);
 78 | 	}
 79 | 	std::printf(
 80 | 		"%zu Triangles x %zu times\n"
 81 | 		"%zu x %zu Image map\n",
 82 | 		std::extent<decltype(Triangles)>::value,
 83 | 		Loops,
 84 | 		Width,
 85 | 		Height
 86 | 	);
 87 | 	std::printf("Algorithm | Average per triangle(ns)\n");
 88 | 	// Generate 2d grid of points to test against, row by row. The final size
 89 | 	// is known up front, so reserve once instead of regrowing the vector.
 90 | 	std::vector<glm::i32vec2> FragCoords;
 91 | 	FragCoords.reserve(Width * Height);
 92 | 	for( std::size_t y = 0; y < Height; ++y )
 93 | 	{
 94 | 		for( std::size_t x = 0; x < Width; ++x )
 95 | 		{
 96 | 			FragCoords.emplace_back(x,y);
 97 | 		}
 98 | 	}
 99 | 	// Benchmark each algorithm against all triangles
100 | 	for( const auto& FillAlgorithm : qTri::FillAlgorithms )
101 | 	{
102 | 		std::printf("%s\t", FillAlgorithm.second);
103 | 		qTri::Image CurFrame(Width, Height);
104 | 		std::size_t ExecTime = 0;
105 | 		for( std::size_t i = 0; i < Loops; ++i)
106 | 		{
107 | 			for( const qTri::Triangle& CurTriangle : Triangles )
108 | 			{
109 | 				ExecTime += Bench<>::Duration(
110 | 					FillAlgorithm.first,
111 | 					FragCoords.data(),
112 | 					CurFrame.Pixels.data(),
113 | 					FragCoords.size(),
114 | 					CurTriangle
115 | 				).count();
116 | 			}
117 | 		}
118 | 		// Average over every timed call (all triangles, all loops)
119 | 		ExecTime /= std::extent<decltype(Triangles)>::value * Loops;
120 | 		std::printf(
121 | 			"| %zu ns\n",
122 | 			ExecTime
123 | 		);
124 | 	}
125 | 	return EXIT_SUCCESS;
126 | }
127 | 


--------------------------------------------------------------------------------
/scripts/GenGifs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import subprocess
  3 | import multiprocessing
  4 | import numpy as np
  5 | import itertools
  6 | from PIL import Image
  7 | from PIL import ImageDraw
  8 | 
  9 | TestTriangle = (
 10 | 	( 5, 5),
 11 | 	(95,40),
 12 | 	(30,95)
 13 | )
 14 | 
 15 | # Barycentric method
 16 | def PointInTriangle( Point, Triangle ):
 17 | 	V0 = tuple(np.subtract(Triangle[2], Triangle[0]))
 18 | 	V1 = tuple(np.subtract(Triangle[1], Triangle[0]))
 19 | 	V2 = tuple(np.subtract(Point      , Triangle[0]))
 20 | 
 21 | 	Dot00 = np.dot(V0, V0)
 22 | 	Dot01 = np.dot(V0, V1)
 23 | 	Dot02 = np.dot(V0, V2)
 24 | 	Dot11 = np.dot(V1, V1)
 25 | 	Dot12 = np.dot(V1, V2)
 26 | 	Area  = (Dot00 * Dot11 - Dot01 * Dot01)
 27 | 	U     = (Dot11 * Dot02 - Dot01 * Dot12)
 28 | 	V     = (Dot00 * Dot12 - Dot01 * Dot02)
 29 | 	return (U >= 0) & (V >= 0) & (U + V < Area)
 30 | 
 31 | def chunks(List, Widths):
 32 | 	i = 0
 33 | 	for CurWidth in Widths:
 34 | 		while i + CurWidth < len(List):
 35 | 			yield List[i:i + CurWidth]
 36 | 			i += CurWidth
 37 | 
 38 | # Params:
 39 | # Name: Name of generated frames. Default "Serial"
 40 | # Path: Path for output images Default: "./frames/(Name)/"
 41 | # Size: Of the image, default (100,100)
 42 | # Scale: Scaling for the resulting image. Default: 2
 43 | # Granularity: List of widths in which elements may be processed in parallel
 44 | #              Default: [1]
 45 | def RenderTriangle( Params ):
 46 | 	# Params
 47 | 	Name = Params.get("Name", "Serial")
 48 | 	Size = Params.get("Size", (100, 100))
 49 | 	Path = "./frames/" + Name + "/"
 50 | 	Scale = Params.get("Scale", 2)
 51 | 	Granularity = Params.get("Granularity", [1])
 52 | 	# Sort by largest to smallest
 53 | 	Granularity.sort()
 54 | 	Granularity.reverse()
 55 | 	# Create target path recursively
 56 | 	os.makedirs(Path, exist_ok=True)
 57 | 	# Create image
 58 | 	Img = Image.new('RGB', Size)
 59 | 	Draw = ImageDraw.Draw(Img)
 60 | 	# Generate each row of points up-front
 61 | 	Points = [
 62 | 		(x,y) for y in range(Size[1]) for x in range(Size[0])
 63 | 	]
 64 | 	i = 0
 65 | 	for CurPoints in chunks(Points,Granularity):
 66 | 		# Hilight the pixels being currently processed
 67 | 		# Hilight hits and misses
 68 | 		Hit = [(x,y) for (x,y) in CurPoints if PointInTriangle((x,y),TestTriangle)]
 69 | 		Miss = [(x,y) for (x,y) in CurPoints if not PointInTriangle((x,y),TestTriangle)]
 70 | 		Draw.point(
 71 | 			Hit,
 72 | 			fill=(0x00, 0xFF, 0x00)
 73 | 		)
 74 | 		Draw.point(
 75 | 			Miss,
 76 | 			fill=(0xFF, 0x00, 0x00)
 77 | 		)
 78 | 		Img.resize(
 79 | 			(Img.width * Scale, Img.height * Scale),
 80 | 			Image.NEAREST
 81 | 		).save(Path + Name + '_' + str(i).zfill(6) + ".png")
 82 | 		i += 1
 83 | 		# Save the "processed" frame
 84 | 		Draw.point(
 85 | 			Hit,
 86 | 			fill=(0xFF, 0xFF, 0xFF)
 87 | 		)
 88 | 		Draw.point(
 89 | 			Miss,
 90 | 			fill=(0x00, 0x00, 0x00)
 91 | 		)
 92 | 		Img.resize(
 93 | 			(Img.width * Scale, Img.height * Scale),
 94 | 			Image.NEAREST
 95 | 		).save(Path + Name + '_' + str(i).zfill(6) + ".png")
 96 | 		i += 1
 97 | 	subprocess.Popen(
 98 | 		[
 99 | 			'ffmpeg',
100 | 			'-y',
101 | 			'-framerate','50',
102 | 			'-i', Path + Name + '_%06d.png',
103 | 			Name + '.gif'
104 | 		]
105 | 	).wait()
106 | 
107 | Configs = [
108 | # Serial
109 | 	{
110 | 		"Name": "Serial",
111 | 		"Granularity": [1],
112 | 		"Scale": 2,
113 | 		"Size": (100, 100)
114 | 	},
115 | # SSE/NEON
116 | 	{
117 | 		"Name": "SSE-NEON",
118 | 		"Granularity": [4,1],
119 | 		"Scale": 2,
120 | 		"Size": (100, 100)
121 | 	},
122 | # AVX2
123 | 	{
124 | 		"Name": "AVX2",
125 | 		"Granularity": [8,4,1],
126 | 		"Scale": 2,
127 | 		"Size": (100, 100)
128 | 	},
129 | # AVX512
130 | 	{
131 | 		"Name": "AVX512",
132 | 		"Granularity": [16,8,4,1],
133 | 		"Scale": 2,
134 | 		"Size": (100, 100)
135 | 	}
136 | ]
137 | 
138 | Processes = [
139 | 	multiprocessing.Process(
140 | 		target=RenderTriangle, args=(Config,)
141 | 	) for Config in Configs
142 | ]
143 | 
144 | for Process in Processes:
145 | 	Process.start()
146 | 
147 | for Process in Processes:
148 | 	Process.join()
149 | 


--------------------------------------------------------------------------------
/test/FillShape.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstddef>
  2 | #include <cstdint>
  3 | #include <cstdlib>
  4 | #include <type_traits>
  5 | #include <algorithm>
  6 | #include <numeric>
  7 | #include <experimental/filesystem>
  8 | namespace fs = std::experimental::filesystem;
  9 | 
 10 | #include <glm/glm.hpp>
 11 | #include <glm/gtc/constants.hpp>
 12 | #include <glm/trigonometric.hpp>
 13 | 
 14 | #include <qTriangle/qTriangle.hpp>
 15 | 
 16 | #define STB_IMAGE_WRITE_IMPLEMENTATION
 17 | #include "stb_image_write.h"
 18 | 
 19 | constexpr std::size_t Width = 300;
 20 | constexpr std::size_t Height = 300;
 21 | 
 22 | // Fills the outline of the letter "a" as a fan of triangles from the origin:
 23 | // each triangle inverts the pixels it covers, and PNGs are written per step.
 24 | int main()
 25 | {
 26 | 	// "a"
 27 | 	const glm::i32vec2 Edges[] = {
 28 | 		glm::i32vec2{235,152},
 29 | 		glm::i32vec2{176,165},
 30 | 		glm::i32vec2{144,173},
 31 | 		glm::i32vec2{129,184},
 32 | 		glm::i32vec2{124,202},
 33 | 		glm::i32vec2{135,226},
 34 | 		glm::i32vec2{168,235},
 35 | 		glm::i32vec2{206,226},
 36 | 		glm::i32vec2{230,201},
 37 | 		glm::i32vec2{236,165},
 38 | 		glm::i32vec2{235,152},
 39 | 
 40 | 		glm::i32vec2{238,233},
 41 | 		glm::i32vec2{200,257},
 42 | 		glm::i32vec2{159,264},
 43 | 		glm::i32vec2{105,247},
 44 | 		glm::i32vec2{86,203},
 45 | 		glm::i32vec2{93,174},
 46 | 		glm::i32vec2{112,153},
 47 | 		glm::i32vec2{138,141},
 48 | 		glm::i32vec2{171,136},
 49 | 		glm::i32vec2{236,123},
 50 | 		glm::i32vec2{236,114},
 51 | 		glm::i32vec2{226,82},
 52 | 		glm::i32vec2{184,70},
 53 | 		glm::i32vec2{146,79},
 54 | 		glm::i32vec2{128,111},
 55 | 		glm::i32vec2{92,106},
 56 | 		glm::i32vec2{108,69},
 57 | 		glm::i32vec2{140,48},
 58 | 		glm::i32vec2{189,40},
 59 | 		glm::i32vec2{234,47},
 60 | 		glm::i32vec2{259,63},
 61 | 		glm::i32vec2{270,87},
 62 | 		glm::i32vec2{272,121},
 63 | 		glm::i32vec2{272,169},
 64 | 		glm::i32vec2{274,233},
 65 | 		glm::i32vec2{283,259},
 66 | 		glm::i32vec2{246,259},
 67 | 		glm::i32vec2{238,233},
 68 | 	};
 69 | 
 70 | 	qTri::Triangle Triangles[std::extent<decltype(Edges)>::value];
 71 | 
 72 | 	for( std::size_t i = 0; i < std::extent<decltype(Edges)>::value; ++i )
 73 | 	{
 74 | 		Triangles[i] = qTri::Triangle{
 75 | 			{
 76 | 				{0, 0},
 77 | 				Edges[i],
 78 | 				Edges[(i + 1) % std::extent<decltype(Edges)>::value]
 79 | 			}
 80 | 		};
 81 | 		const glm::i32vec2 Center = std::accumulate(
 82 | 			std::cbegin(Triangles[i]),
 83 | 			std::cend(Triangles[i]),
 84 | 			glm::i32vec2{0,0}
 85 | 		) / 3;
 86 | 		std::sort(
 87 | 			std::begin(Triangles[i]),
 88 | 			std::end(Triangles[i]),
 89 | 			[&Center](const glm::i32vec2& A, const glm::i32vec2& B) -> bool
 90 | 				{
 91 | 					// Sort points by their angle from the center
 92 | 					const glm::i32vec2 DirectionA = Center - A;
 93 | 					const glm::i32vec2 DirectionB = Center - B;
 94 | 					const auto AngleA = glm::atan<glm::float32_t>(DirectionA.y, DirectionA.x);
 95 | 					const auto AngleB = glm::atan<glm::float32_t>(DirectionB.y, DirectionB.x);
 96 | 					return AngleA < AngleB;
 97 | 				}
 98 | 		);
 99 | 	}
100 | 	// Generate 2d grid of points to test against; the size is known up front
101 | 	std::vector<glm::i32vec2> FragCoords;
102 | 	FragCoords.reserve(Width * Height);
103 | 	for( std::size_t y = 0; y < Height; ++y )
104 | 	{
105 | 		for( std::size_t x = 0; x < Width; ++x )
106 | 		{
107 | 			FragCoords.emplace_back(x,y);
108 | 		}
109 | 	}
110 | 
111 | 	for( const auto& FillAlgorithm : qTri::FillAlgorithms )
112 | 	{
113 | 		std::printf("%s:\n", FillAlgorithm.second);
114 | 		const auto FrameFolder = fs::path("Frames") / FillAlgorithm.second;
115 | 		fs::create_directories(FrameFolder);
116 | 		qTri::Image Frame(Width, Height);
117 | 		std::size_t FrameIdx = 0;
118 | 		for( const qTri::Triangle& CurTriangle : Triangles )
119 | 		{
120 | 			qTri::Image CurInversion(Width, Height);
121 | 			// Render triangle to inversion mask
122 | 			FillAlgorithm.first(
123 | 				FragCoords.data(),
124 | 				CurInversion.Pixels.data(),
125 | 				FragCoords.size(),
126 | 				CurTriangle
127 | 			);
128 | 			// Append inversion mask: covered pixels flip, the rest are kept
129 | 			for( std::size_t i = 0; i < Width * Height; ++i )
130 | 			{
131 | 				Frame.Pixels[i] = CurInversion.Pixels[i] ? ~Frame.Pixels[i] : Frame.Pixels[i];
132 | 			}
133 | 			stbi_write_png(
134 | 				((FrameFolder / std::to_string(FrameIdx)).string() + ".png").c_str(),
135 | 				Width,
136 | 				Height,
137 | 				1,
138 | 				Frame.Pixels.data(),
139 | 				0
140 | 			);
141 | 
142 | 			// Write an image of the current triangle
143 | 			// Post-process from [0x00,0x01] to [0x00,0xFF]
144 | 			// Compiler vectorization loves loops like this
145 | 			for( std::size_t i = 0; i < Width * Height; ++i )
146 | 			{
147 | 				CurInversion.Pixels[i] *= 0xFF;
148 | 			}
149 | 			// Via .string() like the frame above: path::c_str() is wchar_t* on Windows
150 | 			stbi_write_png(
151 | 				(FrameFolder / ("Tri" + std::to_string(FrameIdx) + ".png")).string().c_str(),
152 | 				Width,
153 | 				Height,
154 | 				1,
155 | 				CurInversion.Pixels.data(),
156 | 				0
157 | 			);
158 | 			// ffmpeg -f image2 -framerate 2 -i %d.png -vf "scale=iw*2:ih*2" -sws_flags neighbor Anim.gif
159 | 			++FrameIdx;
160 | 		}
161 | 	}
162 | 
163 | 	return EXIT_SUCCESS;
164 | }
165 | 


--------------------------------------------------------------------------------
/source/qTriangle/qTriangle-x86.hpp:
--------------------------------------------------------------------------------
  1 | #include <qTriangle/qTriangle.hpp>
  2 | #include <x86intrin.h>
  3 | 
  4 | template<std::uint8_t WidthExp2> // Specialized below: <0> tests one point per loop iteration, <1> tests two
  5 | inline void CrossProductMethod(
  6 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
  7 | 	const qTri::Triangle& Tri
  8 | );
  9 | template<std::uint8_t WidthExp2> // Only <0> (one point per loop iteration) is specialized in this header
 10 | inline void BarycentricMethod(
 11 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 12 | 	const qTri::Triangle& Tri
 13 | );
 14 | 
 15 | #if defined(__SSE4_1__)
 16 | 
 17 | // Serial cross-product test: one point per loop iteration, with the three edge
 18 | // tests evaluated side by side in the low three 32-bit lanes of an SSE register.
 19 | // Results[i] is OR-ed with 1 when Points[i] passes all three edge tests and is
 20 | // left as-is otherwise. Products come from _mm_mullo_epi32, so only their low
 21 | // 32 bits take part in the comparison.
 22 | template<>
 23 | inline void CrossProductMethod<0>(
 24 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
 25 | 	const qTri::Triangle& Tri
 26 | )
 27 | {
 28 | 	// [ Tri[1].y, Tri[1].x, Tri[0].y, Tri[0].x]
 29 | 	const __m128i Tri10 = _mm_loadu_si128(
 30 | 		reinterpret_cast<const __m128i*>(Tri.data())
 31 | 	);
 32 | 	// [ Tri[2].y, Tri[2].x, Tri[2].y, Tri[2].x]
 33 | 	const __m128i Tri22 = _mm_set1_epi64x(
 34 | 		*reinterpret_cast<const std::uint64_t*>(Tri.data() + 2)
 35 | 	);
 36 | 
 37 | 	// unpacklo(above)
 38 | 	// [ Tri[2].y, Tri[0].y, Tri[2].x, Tri[0].x]
 39 | 	// unpackhi(above)
 40 | 	// [ Tri[2].y, Tri[1].y, Tri[2].x, Tri[1].x]
 41 | 	const __m128i Tri20yyxx = _mm_unpacklo_epi32(
 42 | 		Tri10, Tri22
 43 | 	);
 44 | 	const __m128i Tri21yyxx = _mm_unpackhi_epi32(
 45 | 		Tri10, Tri22
 46 | 	);
 47 | 
 48 | 	// unpacklo(above)
 49 | 	// [ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x]
 50 | 	// unpackhi(above)
 51 | 	// [ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y]
 52 | 	const __m128i Tri2210x = _mm_unpacklo_epi32(
 53 | 		Tri20yyxx, Tri21yyxx
 54 | 	);
 55 | 	const __m128i Tri2210y = _mm_unpackhi_epi32(
 56 | 		Tri20yyxx, Tri21yyxx
 57 | 	);
 58 | 
 59 | 	// Lanes 0..2 end up holding Tri[0]-Tri[2], Tri[1]-Tri[0], Tri[2]-Tri[1];
 60 | 	// lane 3 is Tri[2]-Tri[2] and is masked out of the final check.
 61 | 	// [ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x]
 62 | 	// - 
 63 | 	// [ Tri[2].x, Tri[1].x, Tri[0].x, Tri[2].x]
 64 | 	//   ^ alignr_epi8([ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x],12)
 65 | 	const __m128i EdgeDirx = _mm_sub_epi32(
 66 | 		Tri2210x,
 67 | 		_mm_alignr_epi8(Tri2210x, Tri2210x, 12)
 68 | 	);
 69 | 	// [ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y]
 70 | 	// - 
 71 | 	// [ Tri[2].y, Tri[1].y, Tri[0].y, Tri[2].y]
 72 | 	//   ^ alignr_epi8([ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y],12)
 73 | 	const __m128i EdgeDiry = _mm_sub_epi32(
 74 | 		Tri2210y,
 75 | 		_mm_alignr_epi8(Tri2210y, Tri2210y, 12)
 76 | 	);
 77 | 
 78 | 	for( std::size_t i = 0; i < Count; ++i )
 79 | 	{
 80 | 		// Load one point into the low 64 bits, then splat x and y across all lanes
 81 | 		const __m128i CurPoint = _mm_loadl_epi64(
 82 | 			reinterpret_cast<const __m128i*>(&Points[i])
 83 | 		);
 84 | 		const __m128i CurPointx = _mm_shuffle_epi32(
 85 | 			CurPoint, 0b00'00'00'00
 86 | 		);
 87 | 		const __m128i CurPointy = _mm_shuffle_epi32(
 88 | 			CurPoint, 0b01'01'01'01
 89 | 		);
 90 | 		// PointDirx = Point[i].x - Tri2210x
 91 | 		// PointDiry = Point[i].y - Tri2210y
 92 | 		const __m128i PointDirx = _mm_sub_epi32(
 93 | 			CurPointx, Tri2210x
 94 | 		);
 95 | 		const __m128i PointDiry = _mm_sub_epi32(
 96 | 			CurPointy, Tri2210y
 97 | 		);
 98 | 		// |   --   |  EdgeDir[2].x |  EdgeDir[1].x |  EdgeDir[0].x | < EdgeDirX
 99 | 		//                     |    mul    |
100 | 		// |   --   | PointDir[2].y | PointDir[1].y | PointDir[0].y | < PointDiry
101 | 		//                     |    sub    |
102 | 		// |   --   |  EdgeDir[2].y |  EdgeDir[1].y |  EdgeDir[0].y | < EdgeDiry
103 | 		//                     |    mul    |
104 | 		// |   --   | PointDir[2].x | PointDir[1].x | PointDir[0].x | < PointDirx
105 | 
106 | 		// We're only checking if the signs are >=0 so there is a lot of
107 | 		// optimization that can be done, such as eliminating the subtraction
108 | 		// in the determinant to just comparing the two products directly
109 | 		// ex: a.x*b.y - a.y*b.x >= 0
110 | 		//     a.x*b.y >= a.y*b.x
111 | 		// DetHi = EdgeDirx * PointDiry
112 | 		// DetLo = EdgeDiry * PointDirx
113 | 		const __m128i DetHi = _mm_mullo_epi32(
114 | 			EdgeDirx, PointDiry
115 | 		);
116 | 		const __m128i DetLo = _mm_mullo_epi32(
117 | 			EdgeDiry, PointDirx
118 | 		);
119 | 
120 | 		const std::uint16_t CheckMask = _mm_movemask_epi8(
121 | 			_mm_cmplt_epi32(
122 | 				DetHi, DetLo
123 | 			)
124 | 		) & 0x0'F'F'F;
125 | 
126 | 		// Bits 0-11 cover lanes 0-2; the point passes when no lane reports DetHi < DetLo
127 | 		Results[i] |= CheckMask == 0x0'0'0'0;
128 | 	}
129 | }
130 | 
131 | #if defined(__AVX2__)
132 | 
133 | // Two at a time: each 128-bit half of an AVX2 register runs the same three-edge
134 | // test as the serial version on its own point. Results[i] is OR-ed with 1 for
135 | // every point that passes; an odd trailing point is handed to CrossProductMethod<0>.
136 | template<>
137 | inline void CrossProductMethod<1>(
138 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
139 | 	const qTri::Triangle& Tri
140 | )
141 | {
142 | 	// [ Tri[1].y, Tri[1].x, Tri[0].y, Tri[0].x]
143 | 	const __m128i Tri10 = _mm_loadu_si128(
144 | 		reinterpret_cast<const __m128i*>(Tri.data())
145 | 	);
146 | 	// [ Tri[2].y, Tri[2].x, Tri[2].y, Tri[2].x]
147 | 	const __m128i Tri22 = _mm_set1_epi64x(
148 | 		*reinterpret_cast<const std::uint64_t*>(Tri.data() + 2)
149 | 	);
150 | 
151 | 	// unpacklo(above)
152 | 	// [ Tri[2].y, Tri[0].y, Tri[2].x, Tri[0].x]
153 | 	// unpackhi(above)
154 | 	// [ Tri[2].y, Tri[1].y, Tri[2].x, Tri[1].x]
155 | 	const __m128i Tri20yyxx = _mm_unpacklo_epi32(
156 | 		Tri10, Tri22
157 | 	);
158 | 	const __m128i Tri21yyxx = _mm_unpackhi_epi32(
159 | 		Tri10, Tri22
160 | 	);
161 | 
162 | 	// unpacklo(above)
163 | 	// [ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x]
164 | 	// unpackhi(above)
165 | 	// [ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y]
166 | 	// Both are broadcast so that each 128-bit half carries its own copy
167 | 	const __m256i Tri2210x2x = _mm256_broadcastsi128_si256(
168 | 		_mm_unpacklo_epi32(
169 | 			Tri20yyxx, Tri21yyxx
170 | 		)
171 | 	);
172 | 	const __m256i Tri2210x2y = _mm256_broadcastsi128_si256(
173 | 		_mm_unpackhi_epi32(
174 | 			Tri20yyxx, Tri21yyxx
175 | 		)
176 | 	);
177 | 
178 | 	// [ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x]
179 | 	// - 
180 | 	// [ Tri[2].x, Tri[1].x, Tri[0].x, Tri[2].x]
181 | 	//   ^ alignr_epi8([ Tri[2].x, Tri[2].x, Tri[1].x, Tri[0].x],12)
182 | 	const __m256i EdgeDirx2x = _mm256_sub_epi32(
183 | 		Tri2210x2x,
184 | 		_mm256_alignr_epi8(Tri2210x2x, Tri2210x2x, 12)
185 | 	);
186 | 	// [ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y]
187 | 	// - 
188 | 	// [ Tri[2].y, Tri[1].y, Tri[0].y, Tri[2].y]
189 | 	//   ^ alignr_epi8([ Tri[2].y, Tri[2].y, Tri[1].y, Tri[0].y],12)
190 | 	const __m256i EdgeDirx2y = _mm256_sub_epi32(
191 | 		Tri2210x2y,
192 | 		_mm256_alignr_epi8(Tri2210x2y, Tri2210x2y, 12)
193 | 	);
194 | 
195 | 	// Only whole pairs are processed here, so neither the 128-bit load nor the
196 | 	// 16-bit store below can touch element Count when Count is odd
197 | 	std::size_t i = 0;
198 | 	for( ; i + 1 < Count; i += 2 )
199 | 	{
200 | 		// Points[i] is copied into the low 128-bit half, Points[i + 1] into the high half
201 | 		const __m256i CurPointx2 = _mm256_permute4x64_epi64(
202 | 			_mm256_castsi128_si256(
203 | 				_mm_loadu_si128(
204 | 					reinterpret_cast<const __m128i*>(&Points[i])
205 | 				)
206 | 			),
207 | 			0b01'01'00'00
208 | 		);
209 | 		const __m256i CurPointx2x = _mm256_shuffle_epi32(
210 | 			CurPointx2, 0b00'00'00'00
211 | 		);
212 | 		const __m256i CurPointx2y = _mm256_shuffle_epi32(
213 | 			CurPointx2, 0b01'01'01'01
214 | 		);
215 | 		const __m256i PointDirx2x = _mm256_sub_epi32(
216 | 			CurPointx2x, Tri2210x2x
217 | 		);
218 | 		const __m256i PointDirx2y = _mm256_sub_epi32(
219 | 			CurPointx2y, Tri2210x2y
220 | 		);
221 | 		const __m256i DetHix2 = _mm256_mullo_epi32(
222 | 			EdgeDirx2x, PointDirx2y
223 | 		);
224 | 		const __m256i DetLox2 = _mm256_mullo_epi32(
225 | 			EdgeDirx2y, PointDirx2x
226 | 		);
227 | 		// Check = DetHi >= DetLo = -(DetHi < DetLo) = ~(DetLo > DetHi)
228 | 		// Each 16-bit half keeps 12 pass bits (lanes 0-2); adding 1 carries into
229 | 		// bit 12 (point i) or bit 28 (point i + 1) only when all 12 are set
230 | 		const std::uint32_t CheckMaskx2 = (~_mm256_movemask_epi8(
231 | 			_mm256_cmpgt_epi32(
232 | 				DetLox2, DetHix2
233 | 			)
234 | 		) & 0x0'F'F'F'0'F'F'F) + 0x0001'0001;
235 | 		// Gather bits 12 and 28 into a 2-bit index with plain shifts; unlike
236 | 		// _pext_u32 this needs no BMI2 support on top of the AVX2 guard
237 | 		const std::uint32_t PassIndex =
238 | 			((CheckMaskx2 >> 12) & 0b01) | ((CheckMaskx2 >> 27) & 0b10);
239 | 		// 16-bit table entries 0x0000, 0x0001, 0x0100, 0x0101: low byte -> Results[i], high byte -> Results[i + 1]
240 | 		*reinterpret_cast<std::uint16_t*>(Results + i) |=
241 | 			static_cast<std::uint16_t>(
242 | 				0x0101'0100'0001'0000 >> (PassIndex * 16)
243 | 			);
244 | 	}
245 | 	// Finish the odd trailing point (if any) with the serial version
246 | 	CrossProductMethod<0>(Points + i, Results + i, Count - i, Tri);
247 | }
248 | #endif
249 | 
250 | // Serial barycentric test: one point per loop iteration. U, V and the
251 | // triangle's Area are kept as unnormalised integers, and Results[i] is OR-ed
252 | // with 1 when U >= 0, V >= 0 and U + V <= Area - 1; it is left as-is otherwise.
253 | // Only intrinsics up to SSE4.1 are used, matching the __SSE4_1__ guard this
254 | // specialization sits under.
255 | template<>
256 | inline void BarycentricMethod<0>(
257 | 	const glm::i32vec2 Points[], std::uint8_t Results[], std::size_t Count,
258 | 	const qTri::Triangle& Tri
259 | )
260 | {
261 | 	// [ Tri[1].y, Tri[1].x, Tri[0].y, Tri[0].x]
262 | 	const __m128i Tri10 = _mm_loadu_si128(
263 | 		reinterpret_cast<const __m128i*>(Tri.data())
264 | 	);
265 | 	// [ Tri[2].y, Tri[2].x, Tri[2].y, Tri[2].x]
266 | 	const __m128i Tri22 = _mm_set1_epi64x(
267 | 		*reinterpret_cast<const std::uint64_t*>(Tri.data() + 2)
268 | 	);
269 | 	// [ Tri[2].y, Tri[2].x, Tri[1].y, Tri[1].x]
270 | 	const __m128i Tri21 = _mm_alignr_epi8(
271 | 		Tri22, Tri10, 8
272 | 	);
273 | 
274 | 	// _mm_blend_epi16 with mask 0b00'00'11'00 takes 16-bit words 2 and 3, i.e.
275 | 	// 32-bit lane 1, from its second operand (_mm_blend_epi32 would need AVX2)
276 | 	// | Tri[1].x | Tri[0].y | Tri[2].y | Tri[0].x |
277 | 	const __m128i ConstVec_1x0y2y0x = _mm_blend_epi16(
278 | 		_mm_shuffle_epi32(Tri10,0b10'01'00'00),// 1x0y__0x
279 | 		Tri22,                                 // ____2y__
280 | 		0b00'00'11'00
281 | 	);
282 | 
283 | 	// | Tri[1].y | Tri[0].x | Tri[2].x | Tri[0].y |
284 | 	const __m128i ConstVec_1y0x2x0y = _mm_blend_epi16(
285 | 		_mm_shuffle_epi32(Tri10,0b11'00'00'01),// 1y0x__0y
286 | 		_mm_shuffle_epi32(Tri22,0b00'00'00'00),// ____2x__
287 | 		0b00'00'11'00
288 | 	);
289 | 
290 | 	// Det01: Tri[1].y * Tri[0].x - Tri[0].y * Tri[1].x
291 | 	// Det20: Tri[2].x * Tri[0].y - Tri[0].x * Tri[2].y
292 | 
293 | 	// | Tri[1].y | Tri[2].x | Tri[1].y | Tri[2].x |
294 | 	// |    *     |    *     |    *     |    *     |
295 | 	// | Tri[0].x | Tri[0].y | Tri[0].x | Tri[0].y |
296 | 	// |    -     |    -     |    -     |    -     |
297 | 	// | Tri[0].y | Tri[0].x | Tri[0].y | Tri[0].x |
298 | 	// |    *     |    *     |    *     |    *     |
299 | 	// | Tri[1].x | Tri[2].y | Tri[1].x | Tri[2].y |
300 | 	// [  Det01  ,  Det20    ,  Det01   ,  Det20   ]
301 | 	const __m128i Det0120 = _mm_sub_epi32(
302 | 		_mm_mullo_epi32(
303 | 			_mm_shuffle_epi32(Tri21,0b01'10'01'10),
304 | 			_mm_shuffle_epi32(Tri10,0b00'01'00'01)
305 | 		),
306 | 		_mm_mullo_epi32(
307 | 			_mm_shuffle_epi32(Tri10,0b01'00'01'00),
308 | 			_mm_shuffle_epi32(Tri21,0b00'11'00'11)
309 | 		)
310 | 	);
311 | 
312 | 	// Area: Tri[2].y * Tri[1].x - Tri[2].x * Tri[1].y
313 | 	//       + Det20 + Det01
314 | 
315 | 	// | Tri[2].x | Tri[2].y | Tri[2].x | Tri[2].y |
316 | 	// |    *     |    *     |    *     |    *     |
317 | 	// | Tri[1].y | Tri[1].x | Tri[1].y | Tri[1].x |
318 | 	// |         hsub        |         hsub        |
319 | 	// |    +     |    +     |    +     |    +     |
320 | 	// [  Det01   | Det01    |  Det01   |  Det01   ]
321 | 	// |    +     |    +     |    +     |    +     |
322 | 	// [  Det20   | Det20    |  Det20   |  Det20   ]
323 | 	// [  Area    |  Area    |  Area    |  Area    ]
324 | 	const __m128i AreaProduct = _mm_mullo_epi32(
325 | 		_mm_shuffle_epi32(Tri22, 0b00'01'00'01),
326 | 		_mm_shuffle_epi32(Tri10, 0b11'10'11'10)
327 | 	);
328 | 	const __m128i Area = _mm_add_epi32(
329 | 		_mm_hsub_epi32(AreaProduct, AreaProduct),
330 | 		_mm_hadd_epi32(Det0120, Det0120)
331 | 	);
332 | 
333 | 	// [Area-1, Area-1, 0,0]
334 | 	const __m128i CheckConst = _mm_blend_epi16(
335 | 		_mm_setzero_si128(),
336 | 		_mm_sub_epi32( // Area - 1
337 | 			Area,
338 | 			_mm_set1_epi32(1)
339 | 		),
340 | 		0b11'11'00'00
341 | 	);
342 | 
343 | 	for( std::size_t i = 0; i < Count; ++i )
344 | 	{
345 | 		// [ 0, 0, Point.y, Point.x ]
346 | 		const __m128i Point = _mm_loadl_epi64(
347 | 			reinterpret_cast<const __m128i*>(Points + i)
348 | 		);
349 | 		const __m128i PointYXXY= _mm_shuffle_epi32(
350 | 			Point,
351 | 			0b01'00'00'01
352 | 		);
353 | 		const __m128i PointXYYX = _mm_alignr_epi8(
354 | 			PointYXXY,PointYXXY,8
355 | 		);
356 | 
357 | 		// U:
358 | 		//   Point.y * Tri[0].x - Point.x * Tri[0].y
359 | 		// +
360 | 		//   Point.x * Tri[2].y - Point.y * Tri[2].x
361 | 		// + Det20
362 | 		// V:
363 | 		//   Point.x * Tri[0].y - Point.y * Tri[0].x
364 | 		// +
365 | 		//   Point.y * Tri[1].x - Point.x * Tri[1].y
366 | 		// + Det01
367 | 
368 | 		// If I wanted to do two at a time, I could fit
369 | 		// two UVs into one 128-bit lane. Putting this
370 | 		// here for reference
371 | 		// |  Point.x |  Point.y |  Point.x |  Point.y |
372 | 		// |    *     |    *     |    *     |    *     |
373 | 		// [ Tri[0].y | Tri[0].x | Tri[0].y | Tri[0].x ] < const
374 | 		// |    -     |    -     |    -     |    -     |
375 | 		// |  Point.y |  Point.x |  Point.y |  Point.x |
376 | 		// |    *     |    *     |    *     |    *     |
377 | 		// [ Tri[0].x | Tri[0].y | Tri[0].x | Tri[0].y ] < const
378 | 		// |    +     |    +     |    +     |    +     |
379 | 		// |  Point.y |  Point.x |  Point.y |  Point.x |
380 | 		// |    *     |    *     |    *     |    *     |
381 | 		// [ Tri[1].x | Tri[2].y | Tri[1].x | Tri[2].y ] < const
382 | 		// |    -     |    -     |    -     |    -     |
383 | 		// |  Point.x |  Point.y |  Point.x |  Point.y |
384 | 		// |    *     |    *     |    *     |    *     |
385 | 		// [ Tri[1].y | Tri[2].x | Tri[1].y | Tri[2].x ] < const
386 | 		// |    +     |    +     |    +     |    +     |
387 | 		// [  Det01   |  Det20   |  Det01   |  Det20   ] < const
388 | 		// |    V1    |    U1    |    V0    |    U0    |
389 | 
390 | 		// If I wanted to do 1 at a time though,
391 | 		// I could utilize more lanes to do
392 | 		// independent calculations in parallel
393 | 		// Such as the adds and multiplications
394 | 
395 | 		// |  Point.x |  Point.y | xy
396 | 		// |    *     |    *     |
397 | 		// | Tri[0].y | Tri[0].x ] < const
398 | 		// |    -     |    -     |
399 | 		// |  Point.y |  Point.x | yx
400 | 		// |    *     |    *     |
401 | 		// | Tri[0].x | Tri[0].y ] < const
402 | 		// |    +     |    +     |
403 | 		// |  Point.y |  Point.x | yx
404 | 		// |    *     |    *     |
405 | 		// | Tri[1].x | Tri[2].y ] < const
406 | 		// |    -     |    -     |
407 | 		// |  Point.x |  Point.y | xy
408 | 		// |    *     |    *     |
409 | 		// | Tri[1].y | Tri[2].x ] < const
410 | 		// |    +     |    +     |
411 | 		// |  Det01   |  Det20   ] < const
412 | 		// |    V     |    U     |
413 | 		//
414 | 		// V Utilizing all four lanes V
415 | 		// !Four determinants at once!
416 | 		// |  Point.y |  Point.x |  Point.x |  Point.y | yxxy
417 | 		// |    *     |    *     |    *     |    *     | mul
418 | 		// | Tri[1].x | Tri[0].y | Tri[2].y | Tri[0].x | < const
419 | 		// |    -     |    -     |    -     |    -     | sub
420 | 		// |  Point.x |  Point.y |  Point.y |  Point.x | xyyx
421 | 		// |    *     |    *     |    *     |    *     | mul
422 | 		// | Tri[1].y | Tri[0].x | Tri[2].x | Tri[0].y | < const
423 | 		// |         hadd        |         hadd        | hadd
424 | 		// |    +     |    +     |    +     |    +     | add
425 | 		// [  Det01   |  Det20   |  Det01   |  Det20   ] < const
426 | 		// |    V     |    U     |    V     |    U     |
427 | 
428 | 		__m128i VU = _mm_sub_epi32(
429 | 			_mm_mullo_epi32(
430 | 				PointYXXY,
431 | 				ConstVec_1x0y2y0x //1x'0y'2y'0x
432 | 			),
433 | 			_mm_mullo_epi32(
434 | 				PointXYYX,
435 | 				ConstVec_1y0x2x0y //1y'0x'2x'0y
436 | 			)
437 | 		);
438 | 		VU = _mm_add_epi32(
439 | 			_mm_hadd_epi32(
440 | 				VU, VU
441 | 			),
442 | 			Det0120
443 | 		);
444 | 		// Area = (blah) + Det20 + Det01
445 | 		// U + V < Area ; U + V <= Area - 1
446 | 		// U + V - Area < 0
447 | 		// const auto AreaCheck = _mm_cmplt_epi32(
448 | 		// 	_mm_hadd_epi32(
449 | 		// 		VU, VU
450 | 		// 	),
451 | 		// 	Area
452 | 		// );
453 | 		// -U = U + (- 2 * U )
454 | 		//                |_mm_unpacklo_epi32(0,sign(VU,(-1,-1,-1,-1)))| ( little waste)
455 | 		// hadd[  ( V,U,V,U)     |      (0,-V,0,-U)      ]
456 | 		// |    V+U   |    U+V   |   -V     |   -U     |
457 | 		const __m128i CheckValues = _mm_hadd_epi32(
458 | 			_mm_unpacklo_epi32(
459 | 				_mm_sign_epi32(
460 | 					VU,
461 | 					_mm_set_epi32(-1,-1,-1,-1)
462 | 				),
463 | 				_mm_setzero_si128()
464 | 			),
465 | 			VU
466 | 		);
467 | 		// X <= Y;!( X > Y ); !( Y < X )
468 | 		const __m128i CheckParallel = _mm_cmplt_epi32(
469 | 			CheckConst,
470 | 			CheckValues
471 | 		);
472 | 		// A set lane in CheckParallel marks a failed test, so the point passes
473 | 		// only when the inverted 16-bit byte mask has every bit set
474 | 		const std::uint16_t Mask = ~_mm_movemask_epi8(CheckParallel);
475 | 		Results[i] |= Mask == 0xFFFF;
476 | 		// |    <=    |    <=    |   <=     |   <=     |
477 | 		// | Area-1   | Area-1   |    0     |    0     | < const
478 | 
479 | 		// U = (blah) + Det20; U >= 0; U >= -Det20; -U <= Det20
480 | 		// V = (blah) + Det01; V >= 0; V >= -Det01; -V <= Det01
481 | 		// X >= 0 : !(X < 0)
482 | 		// const auto SignCheck = _mm_cmplt_epi32(
483 | 		// 	VU, _mm_setzero_si128()
484 | 		// );
485 | 
486 | 		// const auto AreaSignCheck = _mm_andnot_si128(
487 | 		// 	SignCheck, AreaCheck
488 | 		// );
489 | 		// Results[i] |= _mm_movemask_epi8(
490 | 		// 	AreaSignCheck
491 | 		// ) == 0xFFFF;
492 | 	}
493 | }
494 | #endif


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # qTriangle [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/Wunkolo/qTriangle/master/LICENSE) (WIP)
  2 | 
  3 | |||||
  4 | |:-:|:-:|:-:|:-:|
  5 | |Serial|SSE/NEON|AVX2|AVX512|
  6 | |![Serial](media/Serial.gif)|![SSE/NEON](media/SSE-NEON.gif)|![AVX2](media/AVX2.gif)|![AVX512](media/AVX512.gif)|
  7 | 
  8 | qTriangle is a personal study to design a **q**uick way to detect if a point is within a **Triangle** by means of vectorization
  9 | 
 10 | [![](media/glsl.png)](https://www.shadertoy.com/view/4t3czN)
 11 | 
 12 | 
 13 | ### [Check out a live GLSL sample here!](https://www.shadertoy.com/view/4t3czN)
 14 | 
 15 | The Point-In-Triangle problem is determining whether a Cartesian coordinate lands on the interior of a triangle. Here only the 2D case is examined, and the triangle must have some surface **area** for a point to land on, so the degenerate case in which all three vertices are collinear (the worst case of a *very* slim triangle) is ruled out.
 16 | 
 17 | This problem comes up a lot in the domain of computer graphics and gameplay programming at times. Sometimes it's testing if a single point lands upon a polygon(made up of triangles) and sometimes it's testing if thousands of points happen to land on a triangle or not(such as when rendering a vector triangle against a regular grid during rasterization).
 18 | 
 19 | There are several methods to test if a point happens to land within a triangle in 2D space, each have their own pros and cons and scaling properties.
 20 | 
 21 | # Cross Product Method
 22 | 
 23 | The cross-product operation in vector algebra is a binary operation that takes two vectors in 3D space and creates a new vector that is perpendicular to them both. You can think of the two vectors as describing a plane in 3D space, and the cross product creates a new vector that is perpendicular to this plane. There are, however, two directions perpendicular to a plane: going "in" to the plane and coming "out" of the plane.
 24 | 
 25 | The Cross-Product has a lot of useful properties but one of interest at the moment is the **magnitude** of the resulting vector of the cross product which will be the **area** of the parallelogram that the two original vectors create. Since the points are on the X-Y plane in 3D space this *magnitude* will always be the Z-component of the cross product since all vectors perpendicular to the orthogonal X-Y plane will take the form `[ 0, 0, (some value)]`. This becomes useful later on.
 26 | 
 27 | ![](media/Cross.gif)
 28 | 
 29 | The particular numerical value of this area is not of importance either; rather, the *sign* of the area is of interest (while a negative surface area does not make sense, this value tells us something about the two directional vectors). Notice that whenever the direction to the moving point goes to the "left" of the black directional vector, the area becomes negative, but when it is to the "right", the area is positive.
 30 | This is due to the [right hand rule](https://en.wikipedia.org/wiki/Right-hand_rule) where the direction of positive-rotation being **clockwise** or **counter-clockwise** causes the order of the original two directional vectors to determine the proper orientation of the cross product.
 31 | 
 32 | The sign of the cross-product area, which depends on which side of an edge the "point" direction lands on, solves the problem in the same vein as turning each edge into a linear inequality and then testing whether the point satisfies all of them at once, but in a somewhat more optimal way.
 33 | 
 34 | The three positional vectors of the triangle must be in **clockwise** or **counter-clockwise** order so that three directional vectors can be determinately created.
 35 | ```
 36 | EdgeDir0 = Vertex1 - Vertex0
 37 | EdgeDir1 = Vertex2 - Vertex1
 38 | EdgeDir2 = Vertex0 - Vertex2
 39 | ```
 40 | Then, three additional directional vectors can be made that point from the triangle vertex position to the point that is being tested against
 41 | ```
 42 | PointDir0 = Point - Vertex0
 43 | PointDir1 = Point - Vertex1
 44 | PointDir2 = Point - Vertex2
 45 | ```
 46 | 
 47 | Now, finding out if a point lands within the triangle is determined by using three cross-products and checking the sign of each area. If none of them is negative, then the point is to the "right" of (or exactly on) all the edges. If any of them is negative, then it is not within the triangle.
 48 | ```
 49 | | EdgeDir0 × PointDir0 | >= 0 &&
 50 | | EdgeDir1 × PointDir1 | >= 0 &&
 51 | | EdgeDir2 × PointDir2 | >= 0
 52 | ```
 53 | 
 54 | ![](media/CrossMethod.gif)
 55 | 
 56 | ## Optimizations
 57 | 
 58 | Previously it was determined that all cross products against the X-Y plane will take the form `[ 0, 0, (some value)]`. With this the arithmetic behind the cross-product operation can be much more simplified. Since the cross product can be [calculated using partial determinants of a 3x3 matrix](https://en.wikipedia.org/wiki/Rule_of_Sarrus) then attention only has to be given to the calculations that determine the Z-component alone which is but a 2x2 determinant of the two input vectors.
 59 | 
 60 | This means that if I had two vectors `A` and `B` on the X-Y plane. The cross-product's magnitude is simply:
 61 | ```
 62 | A.x * B.y - A.y * B.x;
 63 | ```
 64 | which reduces the previous arithmetic to:
 65 | ```
 66 | EdgeDir0.x * PointDir0.y - EdgeDir0.y * PointDir0.x >= 0 &&
 67 | EdgeDir1.x * PointDir1.y - EdgeDir1.y * PointDir1.x >= 0 &&
 68 | EdgeDir2.x * PointDir2.y - EdgeDir2.y * PointDir2.x >= 0
 69 | ```
 70 | 
 71 | The full pseudo-code:
 72 | ```cpp
 73 | // Point	   - Position that is being tested
 74 | // Vertex0,1,2 - Vertices of the triangle in **clockwise order**
 75 | 
 76 | // Directional vertices along the edges of the triangle in clockwise order
 77 | EdgeDir0 = Vertex1 - Vertex0
 78 | EdgeDir1 = Vertex2 - Vertex1
 79 | EdgeDir2 = Vertex0 - Vertex2
 80 | 
 81 | // Directional vertices pointing from the triangle vertices to the point
 82 | PointDir0 = Point - Vertex0
 83 | PointDir1 = Point - Vertex1
 84 | PointDir2 = Point - Vertex2
 85 | 
 86 | // Test if each cross-product results in a positive area
 87 | if(
 88 | 	EdgeDir0.x * PointDir0.y - EdgeDir0.y * PointDir0.x >= 0 &&
 89 | 	EdgeDir1.x * PointDir1.y - EdgeDir1.y * PointDir1.x >= 0 &&
 90 | 	EdgeDir2.x * PointDir2.y - EdgeDir2.y * PointDir2.x >= 0
 91 | )
 92 | {
 93 | 	// CurPoint is in triangle!
 94 | }
 95 | ```
 96 | 
 97 | ## Scaling
 98 | 
 99 | If I was to throw thousands of points at a triangle in a for-loop using this algorithm then not all variables have to be re-calculated for each point.
100 | 
101 | The vectors `EdgeDir0`, `EdgeDir1`, `EdgeDir2` only have to be calculated once. For each point the vectors `PointDir0`, `PointDir1`, `PointDir2` have to be recreated.
102 | 
103 | ```
104 | EdgeDir0 = Vertex1 - Vertex0
105 | EdgeDir1 = Vertex2 - Vertex1
106 | EdgeDir2 = Vertex0 - Vertex2
107 | foreach(CurPoint in LotsOfPoints)
108 | {
109 | 	PointDir0 = CurPoint - Vertex0
110 | 	PointDir1 = CurPoint - Vertex1
111 | 	PointDir2 = CurPoint - Vertex2
112 | 	if(
113 | 		EdgeDir0.x * PointDir0.y - EdgeDir0.y * PointDir0.x >= 0 &&
114 | 		EdgeDir1.x * PointDir1.y - EdgeDir1.y * PointDir1.x >= 0 &&
115 | 		EdgeDir2.x * PointDir2.y - EdgeDir2.y * PointDir2.x >= 0
116 | 	)
117 | 	{
118 | 		// CurPoint is in triangle!
119 | 	}
120 | }
121 | ```
122 | Which results in the total overhead for each point being
123 | 
124 | Subtractions|Multiplications|Comparisons
125 | :-:|:-:|:-:
126 | 6|6|3
127 | 
128 | # Barycentric Coordinate Method
129 | 
130 | With some barycentric coordinate trickery one can derive a coordinate system that allows a triangle to be described as a linear "mixture" of its three vertices, with some constraints on how they mix.
131 | 
132 | The barycentric coordinates of a triangle involve its three position-vectors `p1`, `p2`, `p3`.
133 | Using these three points, any new point `p'` _within_ this triangle can be generated by *mixing* the three vertex positions according to three scalar weights `w1`, `w2`, `w3` such that:
134 | `p' = w1 * p1 + w2 * p2 + w3 * p3`.
135 | This is just a linear combination of three points, but `p'` isn't meant to be just any wild combination of them. What you actually want is for the combination to always land on the triangular surface that the three points enclose. This is called a [convex combination](https://en.wikipedia.org/wiki/Convex_combination): if you want to reach only the points within this triangle, then you must constrain the three weights to be non-negative and to sum to exactly `1.0`. This makes the full barycentric equation for a triangle defined by three points:
136 | 
137 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w_1&space;>=&space;0\\w_2&space;>=&space;0\\w_3&space;>=&space;0\\&space;p'&space;=&space;w_1&space;*&space;p_1&space;&plus;&space;w_2&space;*&space;p_2&space;&plus;&space;w_3&space;*&space;p_3\\&space;1&space;=&space;w_1&space;&plus;&space;w_2&space;&plus;&space;w_3\\&space;%&space;p'_x&space;=&space;w_1&space;*&space;p_1_x&space;&plus;&space;w_2&space;*&space;p_2_x&space;&plus;&space;w_3&space;*&space;p_3_x\\&space;p'_y&space;=&space;w_1&space;*&space;p_1_y&space;&plus;&space;w_2&space;*&space;p_2_y&space;&plus;&space;w_3&space;*&space;p_3_y\\" title="\\w_1 >= 0\\w_2 >= 0\\w_3 >= 0\\ p' = w_1 * p_1 + w_2 * p_2 + w_3 * p_3\\ 1 = w_1 + w_2 + w_3\\ % p'_x = w_1 * p_1_x + w_2 * p_2_x + w_3 * p_3_x\\ p'_y = w_1 * p_1_y + w_2 * p_2_y + w_3 * p_3_y\\" />
138 | 
139 | In this case, though, **you already have `p'` and want to determine if it is within this triangle!** Time to cleverly work backwards!
140 | 
141 | ---
142 | 
143 | So given a `p1`, `p2`, `p3` and `p'`, you have to figure out the three positive unknowns `w1`, `w2`, `w3` that balance these conditions. Some linear alg!
144 | 
145 | So starting with this:
146 | 
147 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w_1&space;>=&space;0\\w_2&space;>=&space;0\\w_3&space;>=&space;0\\&space;p'&space;=&space;w_1&space;*&space;p_1&space;&plus;&space;w_2&space;*&space;p_2&space;&plus;&space;w_3&space;*&space;p_3\\&space;1&space;=&space;w_1&space;&plus;&space;w_2&space;&plus;&space;w_3\\&space;%&space;p'_x&space;=&space;w_1&space;*&space;p_1_x&space;&plus;&space;w_2&space;*&space;p_2_x&space;&plus;&space;w_3&space;*&space;p_3_x\\&space;p'_y&space;=&space;w_1&space;*&space;p_1_y&space;&plus;&space;w_2&space;*&space;p_2_y&space;&plus;&space;w_3&space;*&space;p_3_y\\" title="\\w_1 >= 0\\w_2 >= 0\\w_3 >= 0\\ p' = w_1 * p_1 + w_2 * p_2 + w_3 * p_3\\ 1 = w_1 + w_2 + w_3\\ % p'_x = w_1 * p_1_x + w_2 * p_2_x + w_3 * p_3_x\\ p'_y = w_1 * p_1_y + w_2 * p_2_y + w_3 * p_3_y\\" />
148 | 
149 | How about expanding those vectors into their individual(2D) dimensions.
150 | 
151 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w1&space;>=&space;0\\w2&space;>=&space;0\\w3&space;>=&space;0\\&space;p'_x&space;=&space;w_1&space;*&space;p_1_x&space;&plus;&space;w_2&space;*&space;p_2_x&space;&plus;&space;w_3&space;*&space;p_3_x\\&space;p'_y&space;=&space;w_1&space;*&space;p_1_y&space;&plus;&space;w_2&space;*&space;p_2_y&space;&plus;&space;w_3&space;*&space;p_3_y\\&space;1&space;=&space;w_1&space;&plus;&space;w_2&space;&plus;&space;w_3\\" title="\\w1 >= 0\\w2 >= 0\\w3 >= 0\\ p'_x = w_1 * p_1_x + w_2 * p_2_x + w_3 * p_3_x\\ p'_y = w_1 * p_1_y + w_2 * p_2_y + w_3 * p_3_y\\ 1 = w_1 + w_2 + w_3\\" />
152 | 
153 | Now that looks like a matrix! Take out all those weights and put it into a vector!
154 | 
155 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w_1&space;>=&space;0\\w_2&space;>=&space;0\\w_3&space;>=&space;0\\&space;\begin{bmatrix}&space;p'_x\\&space;p'_y\\&space;1\\&space;\end{bmatrix}&space;=&space;\begin{bmatrix}&space;p_1_x&space;&&space;p_2_x&space;&&space;p_3_x&space;\\&space;p_1_y&space;&&space;p_2_y&space;&&space;p_3_y&space;\\&space;1&space;&&space;1&space;&&space;1&space;\end{bmatrix}&space;*&space;\begin{bmatrix}&space;w_1\\&space;w_2\\&space;w_3\\&space;\end{bmatrix}" title="\\w_1 >= 0\\w_2 >= 0\\w_3 >= 0\\ \begin{bmatrix} p'_x\\ p'_y\\ 1\\ \end{bmatrix} = \begin{bmatrix} p_1_x & p_2_x & p_3_x \\ p_1_y & p_2_y & p_3_y \\ 1 & 1 & 1 \end{bmatrix} * \begin{bmatrix} w_1\\ w_2\\ w_3\\ \end{bmatrix}" />
156 | 
157 | So the solution is to invert that 3x3 matrix and multiply it by the point being tested to get the three weights. Once you have the three weights, all you have to do is test that none of them is negative (**Note**: If this smells somewhat like a close derivation of the cross-product method, you're right!)
158 | 
159 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w_1&space;>=&space;0\\w_2&space;>=&space;0\\w_3&space;>=&space;0\\&space;\&space;\begin{bmatrix}&space;p_1_x&space;&&space;p_2_x&space;&&space;p_3_x&space;\\&space;p_1_y&space;&&space;p_2_y&space;&&space;p_3_y&space;\\&space;1&space;&&space;1&space;&&space;1&space;\end{bmatrix}&space;^{-1}&space;*&space;\begin{bmatrix}&space;p'_x\\&space;p'_y\\&space;1\\&space;\end{bmatrix}&space;=&space;\begin{bmatrix}&space;w_1\\&space;w_2\\&space;w_3\\&space;\end{bmatrix}" title="\\w_1 >= 0\\w_2 >= 0\\w_3 >= 0\\ \ \begin{bmatrix} p_1_x & p_2_x & p_3_x \\ p_1_y & p_2_y & p_3_y \\ 1 & 1 & 1 \end{bmatrix} ^{-1} * \begin{bmatrix} p'_x\\ p'_y\\ 1\\ \end{bmatrix} = \begin{bmatrix} w_1\\ w_2\\ w_3\\ \end{bmatrix}" />
160 | 
161 | Matrix inverse... This is where it gets a little hairy
162 | 
163 | <img src="https://latex.codecogs.com/png.latex?\dpi{150}&space;\\w_1&space;>=&space;0\\w_2&space;>=&space;0\\w_3&space;>=&space;0\\&space;\&space;Det&space;=&space;-p_2_x&space;p_1_y&plus;p_3_x&space;p_1_y&plus;p_1_x&space;p_2_y-p_3_x&space;p_2_y-p_1_x&space;p_3_y&plus;p_2_x&space;p_3_y\\&space;\\&space;\frac{1}{Det}&space;*&space;\begin{bmatrix}&space;p_2_y-p_3_y&space;&&space;-p_2_x&plus;p_3_x&space;&&space;-p_3_x&space;p_2_y&plus;p_2_x&space;p_3_y\\&space;-p_1_y&plus;p_3_y&space;&&space;p_1_x-p_3_x&space;&&space;p_3_x&space;p_1_y-p_1_x&space;p_3_y\\&space;p_1_y-p_2_y&space;&&space;-p_1_x&plus;p_2_x&space;&&space;-p_2_x&space;p_1_y&plus;p_1_x&space;p_2_y\\&space;\end{bmatrix}&space;*&space;\begin{bmatrix}&space;p'_x\\&space;p'_y\\&space;1\\&space;\end{bmatrix}&space;=&space;\begin{bmatrix}&space;w_1\\&space;w_2\\&space;w_3\\&space;\end{bmatrix}" title="\\w_1 >= 0\\w_2 >= 0\\w_3 >= 0\\ \ Det = -p_2_x p_1_y+p_3_x p_1_y+p_1_x p_2_y-p_3_x p_2_y-p_1_x p_3_y+p_2_x p_3_y\\ \\ \frac{1}{Det} * \begin{bmatrix} p_2_y-p_3_y & -p_2_x+p_3_x & -p_3_x p_2_y+p_2_x p_3_y\\ -p_1_y+p_3_y & p_1_x-p_3_x & p_3_x p_1_y-p_1_x p_3_y\\ p_1_y-p_2_y & -p_1_x+p_2_x & -p_2_x p_1_y+p_1_x p_2_y\\ \end{bmatrix} * \begin{bmatrix} p'_x\\ p'_y\\ 1\\ \end{bmatrix} = \begin{bmatrix} w_1\\ w_2\\ w_3\\ \end{bmatrix}" />
164 | 
165 | 
166 | And with this, `w1`, `w2`, and `w3` all reduce to relatively "simple" linear equations, and all that has to be checked is whether each of them is greater than or equal to `0`!
167 | 
168 | Here is the equivalent code in GLSL. Keep in mind that GLSL defines matrices and vectors as column-major. So `vec3(1)` means a 1x3 (_width_ x _height_) arrangement of elements, and `mat3(vec3(1),vec3(2),vec3(3))` means a 3x3 matrix whose columns (the vertical ones) are `{1,1,1}`, `{2,2,2}`, and `{3,3,3}`.
169 | ```c
// Returns true when Point lies inside (or on the boundary of) Triangle,
// by solving for the barycentric weights of Point and checking that
// none of them is negative.
bool PointInTriangleBarycentric(
	in vec2 Triangle[3],
	in vec2 Point
)
{
	// Each column is one triangle vertex in homogeneous form (x, y, 1)
	mat3 Vertices = mat3(
		Triangle[0], 1.0f,
		Triangle[1], 1.0f,
		Triangle[2], 1.0f
	);

	vec3 Weights = inverse( Vertices ) * vec3( Point, 1.0 );

	// Same as: Weights.x >= 0.0f && Weights.y >= 0.0f && Weights.z >= 0.0f
	return all( greaterThanEqual( Weights, vec3(0.0f) ) );
}
197 | ```
198 | 
199 | Though this is a pretty naive approach, it still has its uses.
200 | If you wanted to test a million points against a single triangle, you would only have to do a single matrix inverse (pretty expensive!), and then the overhead for testing each individual point would be a matrix-vector multiplication (a 3x3 matrix times an ℝ³ vector, which is pretty much just three dot-products) and three comparisons:
201 | 
202 | Additions|Multiplications|Comparisons
203 | :-:|:-:|:-:
204 | 6|9|3
205 | 
206 | Though, a matrix inverse is a pretty expensive operation to do, especially in a context of possibly thousands or even millions of triangles, with each matrix inverse involving dozens of multiplications and additions, and a division on top of it all. Some more clever observations can allow for some slim optimizations, especially since all that matters is whether the weights are negative or not.
207 | 
208 | ## Optimizations
209 | 
210 | 
211 | 
212 | ---
213 | 
214 | The two directional vectors are derived from three points that describe a plane while the two scalars are typically denoted as *U* and *V* and determine how much these two directional vectors should mix together to create another point on this plane.
215 | Since a triangle has three points, these two vectors can be derived by picking any one point of the triangle and obtaining two directional vectors from this point to the two other points.
216 | 
217 | The *U* and *V* scalar values are typically normalized within the [0.0,1.0] range to easily translate these values into *percentages* that determine how much the two vectors should contribute to the resulting vector.
218 | For example, say I had the two directional vectors `( 8, 2 )` and `( 7, 1 )` and the `U` `V` values `0.5` `1.0` respectively. The resulting point is ` ( 8, 2 ) * 0.5 + ( 7, 1 ) * 1.0 = ( 11, 2 )` which in English would be something like `I want 50% of the ( 8, 2 ) direction and 100% of ( 7, 1 )`.
219 | 
220 | The actual *triangle shape* is made by constraining the `U` and `V` values so that rather than describing a plane very generally it instead will stay within the bounds of a triangle.
221 | 
222 | 
223 | The first two constraints are `U >= 0` and `V >= 0`, which guarantee that the U,V coordinates are always on the *positive* side of the two vectors and do not go backwards, off the triangle. The third constraint, `U + V <= 1`, is [just the line](http://www.wolframalpha.com/input/?i=1+-+x+-+y++%3D+0) `y = 1 - x` [turned into an inequality](http://www.wolframalpha.com/input/?i=1+-+x+-+y++%3D%3E+0) such that all solutions to the inequality equate to a point within a triangle. The actual derivation of this involves some barycentric coordinate limbo.
224 | 
225 | 
226 | 
227 | Given a triangle, the two directional vectors are easy to calculate. Pick any of the three points of a triangle, get the vector direction from this point, to the two other points ( which is just a vector subtraction). After the two directional vectors are obtained, now all that has to be done to see if a point is within a triangle is [*projecting*](https://en.wikipedia.org/wiki/Vector_projection) this point against the two directional vectors to get the *U* and *V* values to test against the three conditions.
228 | 
229 | Projecting a point against these two vectors to get the *U* and *V* values is but trivial dot-product arithmetic. First, another directional vector has to be created, as mixing a positional vector with a directional vector wouldn't make sense in this instance. The very same point that was selected earlier to generate the first two directional vectors must be used once more to generate a third directional vector between this triangle vertex and the point being sampled against (another vector subtraction). This new vector will then be dot-product-ed against the two directional edges (vertical vector multiplication and horizontal addition) to finally determine the `U` and `V` values to test against `U >= 0`, `V >= 0`, and `U + V <= 1`.
230 | 
231 | ![](media/BarycentricMethod.gif)
232 | 


--------------------------------------------------------------------------------
/test/stb_image_write.h:
--------------------------------------------------------------------------------
   1 | /* stb_image_write - v1.09 - public domain - http://nothings.org/stb/stb_image_write.h
   2 | writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
   3 | no warranty implied; use at your own risk
   4 | 
   5 | Before #including,
   6 | 
   7 | #define STB_IMAGE_WRITE_IMPLEMENTATION
   8 | 
   9 | in the file that you want to have the implementation.
  10 | 
  11 | Will probably not work correctly with strict-aliasing optimizations.
  12 | 
  13 | If using a modern Microsoft Compiler, non-safe versions of CRT calls may cause
  14 | compilation warnings or even errors. To avoid this, also before #including,
  15 | 
  16 | #define STBI_MSC_SECURE_CRT
  17 | 
  18 | ABOUT:
  19 | 
  20 | This header file is a library for writing images to C stdio. It could be
  21 | adapted to write to memory or a general streaming interface; let me know.
  22 | 
  23 | The PNG output is not optimal; it is 20-50% larger than the file
  24 | written by a decent optimizing implementation; though providing a custom
  25 | zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
  26 | This library is designed for source code compactness and simplicity,
  27 | not optimal image file size or run-time performance.
  28 | 
  29 | BUILDING:
  30 | 
  31 | You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
  32 | You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
  33 | malloc,realloc,free.
  34 | You can #define STBIW_MEMMOVE() to replace memmove()
  35 | You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
  36 | for PNG compression (instead of the builtin one), it must have the following signature:
  37 | unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
  38 | The returned data will be freed with STBIW_FREE() (free() by default),
  39 | so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
  40 | 
  41 | USAGE:
  42 | 
  43 | There are five functions, one for each image file format:
  44 | 
  45 | int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
  46 | int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
  47 | int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
  48 | int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
  49 | int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
  50 | 
  51 | void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
  52 | 
  53 | There are also five equivalent functions that use an arbitrary write function. You are
  54 | expected to open/close your file-equivalent before and after calling these:
  55 | 
  56 | int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
  57 | int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
  58 | int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
  59 | int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
  60 | int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
  61 | 
  62 | where the callback is:
  63 | void stbi_write_func(void *context, void *data, int size);
  64 | 
  65 | You can configure it with these global variables:
  66 | int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
  67 | int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
  68 | int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
  69 | 
  70 | 
  71 | You can define STBI_WRITE_NO_STDIO to disable the file variant of these
  72 | functions, so the library will not use stdio.h at all. However, this will
  73 | also disable HDR writing, because it requires stdio for formatted output.
  74 | 
  75 | Each function returns 0 on failure and non-0 on success.
  76 | 
  77 | The functions create an image file defined by the parameters. The image
  78 | is a rectangle of pixels stored from left-to-right, top-to-bottom.
  79 | Each pixel contains 'comp' channels of data stored interleaved with 8-bits
  80 | per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
  81 | monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
  82 | The *data pointer points to the first byte of the top-left-most pixel.
  83 | For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
  84 | a row of pixels to the first byte of the next row of pixels.
  85 | 
  86 | PNG creates output files with the same number of components as the input.
  87 | The BMP format expands Y to RGB in the file format and does not
  88 | output alpha.
  89 | 
  90 | PNG supports writing rectangles of data even when the bytes storing rows of
  91 | data are not consecutive in memory (e.g. sub-rectangles of a larger image),
  92 | by supplying the stride between the beginning of adjacent rows. The other
  93 | formats do not. (Thus you cannot write a native-format BMP through the BMP
  94 | writer, both because it is in BGR order and because it may have padding
  95 | at the end of the line.)
  96 | 
  97 | PNG allows you to set the deflate compression level by setting the global
  98 | variable 'stbi_write_png_compression_level' (it defaults to 8).
  99 | 
 100 | HDR expects linear float data. Since the format is always 32-bit rgb(e)
 101 | data, alpha (if provided) is discarded, and for monochrome data it is
 102 | replicated across all three channels.
 103 | 
 104 | TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
 105 | data, set the global variable 'stbi_write_tga_with_rle' to 0.
 106 | 
 107 | JPEG does ignore alpha channels in input data; quality is between 1 and 100.
 108 | Higher quality looks better but results in a bigger image.
 109 | JPEG baseline (no JPEG progressive).
 110 | 
 111 | CREDITS:
 112 | 
 113 | 
 114 | Sean Barrett           -    PNG/BMP/TGA
 115 | Baldur Karlsson        -    HDR
 116 | Jean-Sebastien Guay    -    TGA monochrome
 117 | Tim Kelsey             -    misc enhancements
 118 | Alan Hickman           -    TGA RLE
 119 | Emmanuel Julien        -    initial file IO callback implementation
 120 | Jon Olick              -    original jo_jpeg.cpp code
 121 | Daniel Gibson          -    integrate JPEG, allow external zlib
 122 | Aarni Koskela          -    allow choosing PNG filter
 123 | 
 124 | bugfixes:
 125 | github:Chribba
 126 | Guillaume Chereau
 127 | github:jry2
 128 | github:romigrou
 129 | Sergio Gonzalez
 130 | Jonas Karlsson
 131 | Filip Wasil
 132 | Thatcher Ulrich
 133 | github:poppolopoppo
 134 | Patrick Boettcher
 135 | github:xeekworx
 136 | Cap Petschulat
 137 | Simon Rodriguez
 138 | Ivan Tikhonov
 139 | github:ignotion
 140 | Adam Schackart
 141 | 
 142 | LICENSE
 143 | 
 144 | See end of file for license information.
 145 | 
 146 | */
 147 | 
 148 | #ifndef INCLUDE_STB_IMAGE_WRITE_H
 149 | #define INCLUDE_STB_IMAGE_WRITE_H
 150 | 
 151 | // if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
 152 | #ifndef STBIWDEF
 153 | #ifdef STB_IMAGE_WRITE_STATIC
 154 | #define STBIWDEF  static
 155 | #else
 156 | #ifdef __cplusplus
 157 | #define STBIWDEF  extern "C"
 158 | #else
 159 | #define STBIWDEF  extern
 160 | #endif
 161 | #endif
 162 | #endif
 163 | 
 164 | #ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
 165 | extern int stbi_write_tga_with_rle;
 166 | extern int stbi_write_png_compression_level;
 167 | extern int stbi_write_force_png_filter;
 168 | #endif
 169 | 
 170 | #ifndef STBI_WRITE_NO_STDIO
 171 | STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
 172 | STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
 173 | STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
 174 | STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 175 | STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
 176 | #endif
 177 | 
 178 | typedef void stbi_write_func(void *context, void *data, int size);
 179 | 
 180 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
 181 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 182 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
 183 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
 184 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
 185 | 
 186 | STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 187 | 
 188 | #endif//INCLUDE_STB_IMAGE_WRITE_H
 189 | 
 190 | #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 191 | 
 192 | #ifdef _WIN32
 193 | #ifndef _CRT_SECURE_NO_WARNINGS
 194 | #define _CRT_SECURE_NO_WARNINGS
 195 | #endif
 196 | #ifndef _CRT_NONSTDC_NO_DEPRECATE
 197 | #define _CRT_NONSTDC_NO_DEPRECATE
 198 | #endif
 199 | #endif
 200 | 
 201 | #ifndef STBI_WRITE_NO_STDIO
 202 | #include <stdio.h>
 203 | #endif // STBI_WRITE_NO_STDIO
 204 | 
 205 | #include <stdarg.h>
 206 | #include <stdlib.h>
 207 | #include <string.h>
 208 | #include <math.h>
 209 | 
 210 | #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 211 | // ok
 212 | #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 213 | // ok
 214 | #else
 215 | #error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 216 | #endif
 217 | 
 218 | #ifndef STBIW_MALLOC
 219 | #define STBIW_MALLOC(sz)        malloc(sz)
 220 | #define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
 221 | #define STBIW_FREE(p)           free(p)
 222 | #endif
 223 | 
 224 | #ifndef STBIW_REALLOC_SIZED
 225 | #define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
 226 | #endif
 227 | 
 228 | 
 229 | #ifndef STBIW_MEMMOVE
 230 | #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
 231 | #endif
 232 | 
 233 | 
 234 | #ifndef STBIW_ASSERT
 235 | #include <assert.h>
 236 | #define STBIW_ASSERT(x) assert(x)
 237 | #endif
 238 | 
 239 | #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
 240 | 
 241 | #ifdef STB_IMAGE_WRITE_STATIC
 242 | static int stbi__flip_vertically_on_write = 0;
 243 | static int stbi_write_png_compression_level = 8;
 244 | static int stbi_write_tga_with_rle = 1;
 245 | static int stbi_write_force_png_filter = -1;
 246 | #else
 247 | int stbi_write_png_compression_level = 8;
 248 | int stbi__flip_vertically_on_write = 0;
 249 | int stbi_write_tga_with_rle = 1;
 250 | int stbi_write_force_png_filter = -1;
 251 | #endif
 252 | 
 253 | STBIWDEF void stbi_flip_vertically_on_write(int flag)
 254 | {
 255 | 	stbi__flip_vertically_on_write = flag;
 256 | }
 257 | 
 258 | typedef struct
 259 | {
 260 | 	stbi_write_func *func;
 261 | 	void *context;
 262 | } stbi__write_context;
 263 | 
 264 | // initialize a callback-based context
 265 | static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
 266 | {
 267 | 	s->func = c;
 268 | 	s->context = context;
 269 | }
 270 | 
 271 | #ifndef STBI_WRITE_NO_STDIO
 272 | 
 273 | static void stbi__stdio_write(void *context, void *data, int size)
 274 | {
 275 | 	fwrite(data, 1, size, (FILE*)context);
 276 | }
 277 | 
 278 | static int stbi__start_write_file(stbi__write_context *s, const char *filename)
 279 | {
 280 | 	FILE *f;
 281 | #ifdef STBI_MSC_SECURE_CRT
 282 | 	if( fopen_s(&f, filename, "wb") )
 283 | 		f = NULL;
 284 | #else
 285 | 	f = fopen(filename, "wb");
 286 | #endif
 287 | 	stbi__start_write_callbacks(s, stbi__stdio_write, (void *)f);
 288 | 	return f != NULL;
 289 | }
 290 | 
 291 | static void stbi__end_write_file(stbi__write_context *s)
 292 | {
 293 | 	fclose((FILE *)s->context);
 294 | }
 295 | 
 296 | #endif // !STBI_WRITE_NO_STDIO
 297 | 
 298 | typedef unsigned int stbiw_uint32;
 299 | typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
 300 | 
 301 | static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 302 | {
 303 | 	while( *fmt )
 304 | 	{
 305 | 		switch( *fmt++ )
 306 | 		{
 307 | 		case ' ': break;
 308 | 		case '1':
 309 | 		{
 310 | 			unsigned char x = STBIW_UCHAR(va_arg(v, int));
 311 | 			s->func(s->context, &x, 1);
 312 | 			break;
 313 | 		}
 314 | 		case '2':
 315 | 		{
 316 | 			int x = va_arg(v, int);
 317 | 			unsigned char b[2];
 318 | 			b[0] = STBIW_UCHAR(x);
 319 | 			b[1] = STBIW_UCHAR(x >> 8);
 320 | 			s->func(s->context, b, 2);
 321 | 			break;
 322 | 		}
 323 | 		case '4':
 324 | 		{
 325 | 			stbiw_uint32 x = va_arg(v, int);
 326 | 			unsigned char b[4];
 327 | 			b[0] = STBIW_UCHAR(x);
 328 | 			b[1] = STBIW_UCHAR(x >> 8);
 329 | 			b[2] = STBIW_UCHAR(x >> 16);
 330 | 			b[3] = STBIW_UCHAR(x >> 24);
 331 | 			s->func(s->context, b, 4);
 332 | 			break;
 333 | 		}
 334 | 		default:
 335 | 			STBIW_ASSERT(0);
 336 | 			return;
 337 | 		}
 338 | 	}
 339 | }
 340 | 
 341 | static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
 342 | {
 343 | 	va_list v;
 344 | 	va_start(v, fmt);
 345 | 	stbiw__writefv(s, fmt, v);
 346 | 	va_end(v);
 347 | }
 348 | 
 349 | static void stbiw__putc(stbi__write_context *s, unsigned char c)
 350 | {
 351 | 	s->func(s->context, &c, 1);
 352 | }
 353 | 
 354 | static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 355 | {
 356 | 	unsigned char arr[3];
 357 | 	arr[0] = a, arr[1] = b, arr[2] = c;
 358 | 	s->func(s->context, arr, 3);
 359 | }
 360 | 
 361 | static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
 362 | {
 363 | 	unsigned char bg[3] = { 255, 0, 255 }, px[3];
 364 | 	int k;
 365 | 
 366 | 	if( write_alpha < 0 )
 367 | 		s->func(s->context, &d[comp - 1], 1);
 368 | 
 369 | 	switch( comp )
 370 | 	{
 371 | 	case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
 372 | 	case 1:
 373 | 		if( expand_mono )
 374 | 			stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
 375 | 		else
 376 | 			s->func(s->context, d, 1);  // monochrome TGA
 377 | 		break;
 378 | 	case 4:
 379 | 		if( !write_alpha )
 380 | 		{
 381 | 			// composite against pink background
 382 | 			for( k = 0; k < 3; ++k )
 383 | 				px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
 384 | 			stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
 385 | 			break;
 386 | 		}
 387 | 		/* FALLTHROUGH */
 388 | 	case 3:
 389 | 		stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
 390 | 		break;
 391 | 	}
 392 | 	if( write_alpha > 0 )
 393 | 		s->func(s->context, &d[comp - 1], 1);
 394 | }
 395 | 
 396 | static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
 397 | {
 398 | 	stbiw_uint32 zero = 0;
 399 | 	int i, j, j_end;
 400 | 
 401 | 	if( y <= 0 )
 402 | 		return;
 403 | 
 404 | 	if( stbi__flip_vertically_on_write )
 405 | 		vdir *= -1;
 406 | 
 407 | 	if( vdir < 0 )
 408 | 		j_end = -1, j = y - 1;
 409 | 	else
 410 | 		j_end = y, j = 0;
 411 | 
 412 | 	for( ; j != j_end; j += vdir )
 413 | 	{
 414 | 		for( i = 0; i < x; ++i )
 415 | 		{
 416 | 			unsigned char *d = (unsigned char *)data + (j*x + i)*comp;
 417 | 			stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
 418 | 		}
 419 | 		s->func(s->context, &zero, scanline_pad);
 420 | 	}
 421 | }
 422 | 
 423 | static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
 424 | {
 425 | 	if( y < 0 || x < 0 )
 426 | 	{
 427 | 		return 0;
 428 | 	}
 429 | 	else
 430 | 	{
 431 | 		va_list v;
 432 | 		va_start(v, fmt);
 433 | 		stbiw__writefv(s, fmt, v);
 434 | 		va_end(v);
 435 | 		stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad, expand_mono);
 436 | 		return 1;
 437 | 	}
 438 | }
 439 | 
 440 | static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 441 | {
 442 | 	int pad = (-x * 3) & 3;
 443 | 	return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad,
 444 | 		"11 4 22 4" "4 44 22 444444",
 445 | 		'B', 'M', 14 + 40 + (x * 3 + pad)*y, 0, 0, 14 + 40,  // file header
 446 | 		40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);             // bitmap header
 447 | }
 448 | 
 449 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 450 | {
 451 | 	stbi__write_context s;
 452 | 	stbi__start_write_callbacks(&s, func, context);
 453 | 	return stbi_write_bmp_core(&s, x, y, comp, data);
 454 | }
 455 | 
 456 | #ifndef STBI_WRITE_NO_STDIO
 457 | STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
 458 | {
 459 | 	stbi__write_context s;
 460 | 	if( stbi__start_write_file(&s, filename) )
 461 | 	{
 462 | 		int r = stbi_write_bmp_core(&s, x, y, comp, data);
 463 | 		stbi__end_write_file(&s);
 464 | 		return r;
 465 | 	}
 466 | 	else
 467 | 		return 0;
 468 | }
 469 | #endif //!STBI_WRITE_NO_STDIO
 470 | 
 471 | static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
 472 | {
 473 | 	int has_alpha = (comp == 2 || comp == 4);
 474 | 	int colorbytes = has_alpha ? comp - 1 : comp;
 475 | 	int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
 476 | 
 477 | 	if( y < 0 || x < 0 )
 478 | 		return 0;
 479 | 
 480 | 	if( !stbi_write_tga_with_rle )
 481 | 	{
 482 | 		return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *)data, has_alpha, 0,
 483 | 			"111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
 484 | 	}
 485 | 	else
 486 | 	{
 487 | 		int i, j, k;
 488 | 		int jend, jdir;
 489 | 
 490 | 		stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
 491 | 
 492 | 		if( stbi__flip_vertically_on_write )
 493 | 		{
 494 | 			j = 0;
 495 | 			jend = y;
 496 | 			jdir = 1;
 497 | 		}
 498 | 		else
 499 | 		{
 500 | 			j = y - 1;
 501 | 			jend = -1;
 502 | 			jdir = -1;
 503 | 		}
 504 | 		for( ; j != jend; j += jdir )
 505 | 		{
 506 | 			unsigned char *row = (unsigned char *)data + j * x * comp;
 507 | 			int len;
 508 | 
 509 | 			for( i = 0; i < x; i += len )
 510 | 			{
 511 | 				unsigned char *begin = row + i * comp;
 512 | 				int diff = 1;
 513 | 				len = 1;
 514 | 
 515 | 				if( i < x - 1 )
 516 | 				{
 517 | 					++len;
 518 | 					diff = memcmp(begin, row + (i + 1) * comp, comp);
 519 | 					if( diff )
 520 | 					{
 521 | 						const unsigned char *prev = begin;
 522 | 						for( k = i + 2; k < x && len < 128; ++k )
 523 | 						{
 524 | 							if( memcmp(prev, row + k * comp, comp) )
 525 | 							{
 526 | 								prev += comp;
 527 | 								++len;
 528 | 							}
 529 | 							else
 530 | 							{
 531 | 								--len;
 532 | 								break;
 533 | 							}
 534 | 						}
 535 | 					}
 536 | 					else
 537 | 					{
 538 | 						for( k = i + 2; k < x && len < 128; ++k )
 539 | 						{
 540 | 							if( !memcmp(begin, row + k * comp, comp) )
 541 | 							{
 542 | 								++len;
 543 | 							}
 544 | 							else
 545 | 							{
 546 | 								break;
 547 | 							}
 548 | 						}
 549 | 					}
 550 | 				}
 551 | 
 552 | 				if( diff )
 553 | 				{
 554 | 					unsigned char header = STBIW_UCHAR(len - 1);
 555 | 					s->func(s->context, &header, 1);
 556 | 					for( k = 0; k < len; ++k )
 557 | 					{
 558 | 						stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
 559 | 					}
 560 | 				}
 561 | 				else
 562 | 				{
 563 | 					unsigned char header = STBIW_UCHAR(len - 129);
 564 | 					s->func(s->context, &header, 1);
 565 | 					stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
 566 | 				}
 567 | 			}
 568 | 		}
 569 | 	}
 570 | 	return 1;
 571 | }
 572 | 
 573 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
 574 | {
 575 | 	stbi__write_context s;
 576 | 	stbi__start_write_callbacks(&s, func, context);
 577 | 	return stbi_write_tga_core(&s, x, y, comp, (void *)data);
 578 | }
 579 | 
 580 | #ifndef STBI_WRITE_NO_STDIO
 581 | STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
 582 | {
 583 | 	stbi__write_context s;
 584 | 	if( stbi__start_write_file(&s, filename) )
 585 | 	{
 586 | 		int r = stbi_write_tga_core(&s, x, y, comp, (void *)data);
 587 | 		stbi__end_write_file(&s);
 588 | 		return r;
 589 | 	}
 590 | 	else
 591 | 		return 0;
 592 | }
 593 | #endif
 594 | 
 595 | // *************************************************************************************************
 596 | // Radiance RGBE HDR writer
 597 | // by Baldur Karlsson
 598 | 
 599 | #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 600 | 
 601 | void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 602 | {
 603 | 	int exponent;
 604 | 	float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 605 | 
 606 | 	if( maxcomp < 1e-32f )
 607 | 	{
 608 | 		rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
 609 | 	}
 610 | 	else
 611 | 	{
 612 | 		float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
 613 | 
 614 | 		rgbe[0] = (unsigned char)(linear[0] * normalize);
 615 | 		rgbe[1] = (unsigned char)(linear[1] * normalize);
 616 | 		rgbe[2] = (unsigned char)(linear[2] * normalize);
 617 | 		rgbe[3] = (unsigned char)(exponent + 128);
 618 | 	}
 619 | }
 620 | 
 621 | void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
 622 | {
 623 | 	unsigned char lengthbyte = STBIW_UCHAR(length + 128);
 624 | 	STBIW_ASSERT(length + 128 <= 255);
 625 | 	s->func(s->context, &lengthbyte, 1);
 626 | 	s->func(s->context, &databyte, 1);
 627 | }
 628 | 
 629 | void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
 630 | {
 631 | 	unsigned char lengthbyte = STBIW_UCHAR(length);
 632 | 	STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
 633 | 	s->func(s->context, &lengthbyte, 1);
 634 | 	s->func(s->context, data, length);
 635 | }
 636 | 
 637 | void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 638 | {
 639 | 	unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
 640 | 	unsigned char rgbe[4];
 641 | 	float linear[3];
 642 | 	int x;
 643 | 
 644 | 	scanlineheader[2] = (width & 0xff00) >> 8;
 645 | 	scanlineheader[3] = (width & 0x00ff);
 646 | 
 647 | 	/* skip RLE for images too small or large */
 648 | 	if( width < 8 || width >= 32768 )
 649 | 	{
 650 | 		for( x = 0; x < width; x++ )
 651 | 		{
 652 | 			switch( ncomp )
 653 | 			{
 654 | 			case 4: /* fallthrough */
 655 | 			case 3: linear[2] = scanline[x*ncomp + 2];
 656 | 				linear[1] = scanline[x*ncomp + 1];
 657 | 				linear[0] = scanline[x*ncomp + 0];
 658 | 				break;
 659 | 			default:
 660 | 				linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
 661 | 				break;
 662 | 			}
 663 | 			stbiw__linear_to_rgbe(rgbe, linear);
 664 | 			s->func(s->context, rgbe, 4);
 665 | 		}
 666 | 	}
 667 | 	else
 668 | 	{
 669 | 		int c, r;
 670 | 		/* encode into scratch buffer */
 671 | 		for( x = 0; x < width; x++ )
 672 | 		{
 673 | 			switch( ncomp )
 674 | 			{
 675 | 			case 4: /* fallthrough */
 676 | 			case 3: linear[2] = scanline[x*ncomp + 2];
 677 | 				linear[1] = scanline[x*ncomp + 1];
 678 | 				linear[0] = scanline[x*ncomp + 0];
 679 | 				break;
 680 | 			default:
 681 | 				linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
 682 | 				break;
 683 | 			}
 684 | 			stbiw__linear_to_rgbe(rgbe, linear);
 685 | 			scratch[x + width * 0] = rgbe[0];
 686 | 			scratch[x + width * 1] = rgbe[1];
 687 | 			scratch[x + width * 2] = rgbe[2];
 688 | 			scratch[x + width * 3] = rgbe[3];
 689 | 		}
 690 | 
 691 | 		s->func(s->context, scanlineheader, 4);
 692 | 
 693 | 		/* RLE each component separately */
 694 | 		for( c = 0; c < 4; c++ )
 695 | 		{
 696 | 			unsigned char *comp = &scratch[width*c];
 697 | 
 698 | 			x = 0;
 699 | 			while( x < width )
 700 | 			{
 701 | 				// find first run
 702 | 				r = x;
 703 | 				while( r + 2 < width )
 704 | 				{
 705 | 					if( comp[r] == comp[r + 1] && comp[r] == comp[r + 2] )
 706 | 						break;
 707 | 					++r;
 708 | 				}
 709 | 				if( r + 2 >= width )
 710 | 					r = width;
 711 | 				// dump up to first run
 712 | 				while( x < r )
 713 | 				{
 714 | 					int len = r - x;
 715 | 					if( len > 128 ) len = 128;
 716 | 					stbiw__write_dump_data(s, len, &comp[x]);
 717 | 					x += len;
 718 | 				}
 719 | 				// if there's a run, output it
 720 | 				if( r + 2 < width )
 721 | 				{ // same test as what we break out of in search loop, so only true if we break'd
 722 | 				  // find next byte after run
 723 | 					while( r < width && comp[r] == comp[x] )
 724 | 						++r;
 725 | 					// output run up to r
 726 | 					while( x < r )
 727 | 					{
 728 | 						int len = r - x;
 729 | 						if( len > 127 ) len = 127;
 730 | 						stbiw__write_run_data(s, len, comp[x]);
 731 | 						x += len;
 732 | 					}
 733 | 				}
 734 | 			}
 735 | 		}
 736 | 	}
 737 | }
 738 | 
 739 | static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
 740 | {
 741 | 	if( y <= 0 || x <= 0 || data == NULL )
 742 | 		return 0;
 743 | 	else
 744 | 	{
 745 | 		// Each component is stored separately. Allocate scratch space for full output scanline.
 746 | 		unsigned char *scratch = (unsigned char *)STBIW_MALLOC(x * 4);
 747 | 		int i, len;
 748 | 		char buffer[128];
 749 | 		char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
 750 | 		s->func(s->context, header, sizeof(header) - 1);
 751 | 
 752 | 	#ifdef STBI_MSC_SECURE_CRT
 753 | 		len = sprintf_s(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 754 | 	#else
 755 | 		len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 756 | 	#endif
 757 | 		s->func(s->context, buffer, len);
 758 | 
 759 | 		for( i = 0; i < y; i++ )
 760 | 			stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp * x*(stbi__flip_vertically_on_write ? y - 1 - i : i)*x);
 761 | 		STBIW_FREE(scratch);
 762 | 		return 1;
 763 | 	}
 764 | }
 765 | 
 766 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
 767 | {
 768 | 	stbi__write_context s;
 769 | 	stbi__start_write_callbacks(&s, func, context);
 770 | 	return stbi_write_hdr_core(&s, x, y, comp, (float *)data);
 771 | }
 772 | 
 773 | #ifndef STBI_WRITE_NO_STDIO
 774 | STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 775 | {
 776 | 	stbi__write_context s;
 777 | 	if( stbi__start_write_file(&s, filename) )
 778 | 	{
 779 | 		int r = stbi_write_hdr_core(&s, x, y, comp, (float *)data);
 780 | 		stbi__end_write_file(&s);
 781 | 		return r;
 782 | 	}
 783 | 	else
 784 | 		return 0;
 785 | }
 786 | #endif // STBI_WRITE_NO_STDIO
 787 | 
 788 | 
 789 | //////////////////////////////////////////////////////////////////////////////
 790 | //
 791 | // PNG writer
 792 | //
 793 | 
 794 | #ifndef STBIW_ZLIB_COMPRESS
// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
//
// A stretchy buffer is a plain element pointer `a` preceded in memory by a
// header of two ints. A NULL pointer represents an empty buffer.
//   stbiw__sbraw(a) - pointer to the header (start of the allocation)
//   stbiw__sbm(a)   - header[0]: number of elements allocated
//   stbiw__sbn(a)   - header[1]: number of elements in use
#define stbiw__sbraw(a) ((int *) (a) - 2)
#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
#define stbiw__sbn(a)   stbiw__sbraw(a)[1]

// Growth helpers: a buffer must grow when it is NULL or when adding n elements
// would reach the allocated count. stbiw__sbgrow updates `a` in place through
// stbiw__sbgrowf.
#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))

// Public operations: append one element, query the element count (0 for NULL),
// and release the allocation (frees the header pointer, not `a` itself).
#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
 807 | 
 808 | static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 809 | {
 810 | 	int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1;
 811 | 	void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int) * 2) : 0, itemsize * m + sizeof(int) * 2);
 812 | 	STBIW_ASSERT(p);
 813 | 	if( p )
 814 | 	{
 815 | 		if( !*arr ) ((int *)p)[1] = 0;
 816 | 		*arr = (void *)((int *)p + 2);
 817 | 		stbiw__sbm(*arr) = m;
 818 | 	}
 819 | 	return *arr;
 820 | }
 821 | 
 822 | static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
 823 | {
 824 | 	while( *bitcount >= 8 )
 825 | 	{
 826 | 		stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
 827 | 		*bitbuffer >>= 8;
 828 | 		*bitcount -= 8;
 829 | 	}
 830 | 	return data;
 831 | }
 832 | 
 833 | static int stbiw__zlib_bitrev(int code, int codebits)
 834 | {
 835 | 	int res = 0;
 836 | 	while( codebits-- )
 837 | 	{
 838 | 		res = (res << 1) | (code & 1);
 839 | 		code >>= 1;
 840 | 	}
 841 | 	return res;
 842 | }
 843 | 
 844 | static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
 845 | {
 846 | 	int i;
 847 | 	for( i = 0; i < limit && i < 258; ++i )
 848 | 		if( a[i] != b[i] ) break;
 849 | 	return i;
 850 | }
 851 | 
 852 | static unsigned int stbiw__zhash(unsigned char *data)
 853 | {
 854 | 	stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
 855 | 	hash ^= hash << 3;
 856 | 	hash += hash >> 5;
 857 | 	hash ^= hash << 4;
 858 | 	hash += hash >> 17;
 859 | 	hash ^= hash << 25;
 860 | 	hash += hash >> 6;
 861 | 	return hash;
 862 | }
 863 | 
// Bit-writer helpers. These macros expand in place and rely on the enclosing
// function having locals named `out` (stretchy output buffer), `bitbuf`
// (pending bits) and `bitcount` (number of pending bits).
//   stbiw__zlib_flush()   - move whole pending bytes into `out`
//   stbiw__zlib_add(c,n)  - append the n-bit value c, then flush
//   stbiw__zlib_huffa(b,c)- append the c-bit code b in reversed bit order
#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
#define stbiw__zlib_add(code,codebits) \
      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
// default huffman tables
//   huff1: symbols   0..143 -> 8-bit codes starting at 0x30
//   huff2: symbols 144..255 -> 9-bit codes starting at 0x190
//   huff3: symbols 256..279 -> 7-bit codes starting at 0
//   huff4: symbols 280 and up -> 8-bit codes starting at 0xc0
#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
// stbiw__zlib_huff picks the right table for any symbol; stbiw__zlib_huffb is
// the shortcut for byte values (0..255), which only ever need huff1 or huff2.
#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))

// Number of buckets in the compressor's match hash table (hash values are
// masked with stbiw__ZHASH - 1).
#define stbiw__ZHASH   16384
 877 | 
 878 | #endif // STBIW_ZLIB_COMPRESS
 879 | 
// Compresses `data_len` bytes at `data` into a zlib stream: a 2-byte header,
// a single fixed-Huffman DEFLATE block, and a 4-byte Adler-32 trailer.
//
//   out_len - receives the number of bytes in the returned stream
//   quality - controls how many candidate positions are kept per hash bucket
//             (a bucket is halved once it holds 2*quality entries); values
//             below 5 are raised to 5
//
// Returns a buffer that starts at the beginning of its allocation (so it can be
// released with STBIW_FREE), or NULL if the hash table cannot be allocated.
// When STBIW_ZLIB_COMPRESS is defined the call is forwarded to that
// user-supplied implementation instead.
unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
{
#ifdef STBIW_ZLIB_COMPRESS
	// user provided a zlib compress implementation, use that
	return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
#else // use builtin
	// Base value and extra-bit count for each length code / distance code.
	// lengthc and distc carry one extra trailing entry so the code-search loops
	// below can compare against entry [j + 1].
	static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
	static unsigned char  lengtheb[] = { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
	static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
	static unsigned char  disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
	// bitbuf/bitcount/out are referenced by name from the stbiw__zlib_* macros.
	unsigned int bitbuf = 0;
	int i, j, bitcount = 0;
	unsigned char *out = NULL;
	// One stretchy list of earlier input positions per hash bucket.
	unsigned char ***hash_table = (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
	if( hash_table == NULL )
		return NULL;
	if( quality < 5 ) quality = 5;

	stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
	stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
	stbiw__zlib_add(1, 1);  // BFINAL = 1
	stbiw__zlib_add(1, 2);  // BTYPE = 1 -- fixed huffman

	for( i = 0; i < stbiw__ZHASH; ++i )
		hash_table[i] = NULL;

	i = 0;
	while( i < data_len - 3 )
	{
		// hash next 3 bytes of data to be compressed
		int h = stbiw__zhash(data + i)&(stbiw__ZHASH - 1), best = 3;
		unsigned char *bestloc = 0;
		unsigned char **hlist = hash_table[h];
		int n = stbiw__sbcount(hlist);
		// Find the longest earlier match (at least 3 bytes) among this bucket's candidates.
		for( j = 0; j < n; ++j )
		{
			if( hlist[j] - data > i - 32768 )
			{ // if entry lies within window
				int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
				if( d >= best ) best = d, bestloc = hlist[j];
			}
		}
		// when hash table entry is too long, delete half the entries
		if( hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality )
		{
			STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0])*quality);
			stbiw__sbn(hash_table[h]) = quality;
		}
		// Record the current position as a candidate for later matches.
		stbiw__sbpush(hash_table[h], data + i);

		if( bestloc )
		{
			// "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
			h = stbiw__zhash(data + i + 1)&(stbiw__ZHASH - 1);
			hlist = hash_table[h];
			n = stbiw__sbcount(hlist);
			for( j = 0; j < n; ++j )
			{
				if( hlist[j] - data > i - 32767 )
				{
					int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
					if( e > best )
					{ // if next match is better, bail on current match
						bestloc = NULL;
						break;
					}
				}
			}
		}

		if( bestloc )
		{
			// Emit a (length, distance) pair: length code, its extra bits,
			// 5-bit distance code, its extra bits.
			int d = (int)(data + i - bestloc); // distance back
			STBIW_ASSERT(d <= 32767 && best <= 258);
			for( j = 0; best > lengthc[j + 1] - 1; ++j );
			stbiw__zlib_huff(j + 257);
			if( lengtheb[j] ) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
			for( j = 0; d > distc[j + 1] - 1; ++j );
			stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
			if( disteb[j] ) stbiw__zlib_add(d - distc[j], disteb[j]);
			i += best;
		}
		else
		{
			// No usable match: emit the current byte as a literal.
			stbiw__zlib_huffb(data[i]);
			++i;
		}
	}
	// write out final bytes
	for( ; i < data_len; ++i )
		stbiw__zlib_huffb(data[i]);
	stbiw__zlib_huff(256); // end of block
	// pad with 0 bits to byte boundary
	while( bitcount )
		stbiw__zlib_add(0, 1);

	for( i = 0; i < stbiw__ZHASH; ++i )
		(void)stbiw__sbfree(hash_table[i]);
	STBIW_FREE(hash_table);

	{
		// compute adler32 on input
		// The first chunk is data_len % 5552 bytes so that every later chunk is
		// a full 5552 bytes; the sums are reduced modulo 65521 after each chunk.
		unsigned int s1 = 1, s2 = 0;
		int blocklen = (int)(data_len % 5552);
		j = 0;
		while( j < data_len )
		{
			for( i = 0; i < blocklen; ++i ) s1 += data[j + i], s2 += s1;
			s1 %= 65521, s2 %= 65521;
			j += blocklen;
			blocklen = 5552;
		}
		// Trailer bytes: s2 high, s2 low, s1 high, s1 low.
		stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
		stbiw__sbpush(out, STBIW_UCHAR(s2));
		stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
		stbiw__sbpush(out, STBIW_UCHAR(s1));
	}
	*out_len = stbiw__sbn(out);
	// make returned pointer freeable
	// (slide the payload down over the stretchy-buffer header so the result
	// begins at the start of the allocation)
	STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
	return (unsigned char *)stbiw__sbraw(out);
#endif // STBIW_ZLIB_COMPRESS
}
1003 | 
// Computes a table-driven CRC-32 over the first `len` bytes of `buffer`:
// the running value starts as all ones, is updated one byte at a time through
// the 256-entry lookup table, and is inverted before being returned.
static unsigned int stbiw__crc32(unsigned char *buffer, int len)
{
	static unsigned int crc_table[256] =
	{
		0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
		0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
		0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
		0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
		0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
		0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
		0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
		0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
		0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
		0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
		0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
		0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
		0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
		0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
		0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
		0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
		0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
		0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
		0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
		0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
		0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
		0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
		0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
		0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
		0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
		0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
		0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
		0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
		0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
		0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
		0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
		0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
	};

	unsigned int running = ~0u;
	const unsigned char *cursor = buffer;
	int remaining;

	for( remaining = len; remaining > 0; --remaining )
	{
		// Table index is the next input byte xor the low byte of the running value.
		unsigned int slot = *cursor++ ^ (running & 0xff);
		running = (running >> 8) ^ crc_table[slot];
	}
	return ~running;
}
1048 | 
// Output-cursor helpers: each one stores four bytes at `o`/`data` and advances
// that pointer by 4.
//   stbiw__wpng4 - store the four given values, each reduced to a byte
//   stbiw__wp32  - store a 32-bit value most-significant byte first
//                  (note: the expansion already ends with a semicolon)
//   stbiw__wptag - store the first four characters of string `s`
#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
1052 | 
1053 | static void stbiw__wpcrc(unsigned char **data, int len)
1054 | {
1055 | 	unsigned int crc = stbiw__crc32(*data - len - 4, len + 4);
1056 | 	stbiw__wp32(*data, crc);
1057 | }
1058 | 
1059 | static unsigned char stbiw__paeth(int a, int b, int c)
1060 | {
1061 | 	int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
1062 | 	if( pa <= pb && pa <= pc ) return STBIW_UCHAR(a);
1063 | 	if( pb <= pc ) return STBIW_UCHAR(b);
1064 | 	return STBIW_UCHAR(c);
1065 | }
1066 | 
1067 | // @OPTIMIZE: provide an option that always forces left-predict or paeth predict
1068 | static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
1069 | {
1070 | 	static int mapping[] = { 0,1,2,3,4 };
1071 | 	static int firstmap[] = { 0,1,0,5,6 };
1072 | 	int *mymap = (y != 0) ? mapping : firstmap;
1073 | 	int i;
1074 | 	int type = mymap[filter_type];
1075 | 	unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
1076 | 	int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
1077 | 	for( i = 0; i < n; ++i )
1078 | 	{
1079 | 		switch( type )
1080 | 		{
1081 | 		case 0: line_buffer[i] = z[i]; break;
1082 | 		case 1: line_buffer[i] = z[i]; break;
1083 | 		case 2: line_buffer[i] = z[i] - z[i - signed_stride]; break;
1084 | 		case 3: line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); break;
1085 | 		case 4: line_buffer[i] = (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); break;
1086 | 		case 5: line_buffer[i] = z[i]; break;
1087 | 		case 6: line_buffer[i] = z[i]; break;
1088 | 		}
1089 | 	}
1090 | 	for( i = n; i < width*n; ++i )
1091 | 	{
1092 | 		switch( type )
1093 | 		{
1094 | 		case 0: line_buffer[i] = z[i]; break;
1095 | 		case 1: line_buffer[i] = z[i] - z[i - n]; break;
1096 | 		case 2: line_buffer[i] = z[i] - z[i - signed_stride]; break;
1097 | 		case 3: line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); break;
1098 | 		case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], z[i - signed_stride - n]); break;
1099 | 		case 5: line_buffer[i] = z[i] - (z[i - n] >> 1); break;
1100 | 		case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); break;
1101 | 		}
1102 | 	}
1103 | }
1104 | 
1105 | unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
1106 | {
1107 | 	int force_filter = stbi_write_force_png_filter;
1108 | 	int ctype[5] = { -1, 0, 4, 2, 6 };
1109 | 	unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
1110 | 	unsigned char *out, *o, *filt, *zlib;
1111 | 	signed char *line_buffer;
1112 | 	int j, zlen;
1113 | 
1114 | 	if( stride_bytes == 0 )
1115 | 		stride_bytes = x * n;
1116 | 
1117 | 	if( force_filter >= 5 )
1118 | 	{
1119 | 		force_filter = -1;
1120 | 	}
1121 | 
1122 | 	filt = (unsigned char *)STBIW_MALLOC((x*n + 1) * y); if( !filt ) return 0;
1123 | 	line_buffer = (signed char *)STBIW_MALLOC(x * n); if( !line_buffer ) { STBIW_FREE(filt); return 0; }
1124 | 	for( j = 0; j < y; ++j )
1125 | 	{
1126 | 		int filter_type;
1127 | 		if( force_filter > -1 )
1128 | 		{
1129 | 			filter_type = force_filter;
1130 | 			stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer);
1131 | 		}
1132 | 		else
1133 | 		{ // Estimate the best filter by running through all of them:
1134 | 			int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
1135 | 			for( filter_type = 0; filter_type < 5; filter_type++ )
1136 | 			{
1137 | 				stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer);
1138 | 
1139 | 				// Estimate the entropy of the line using this filter; the less, the better.
1140 | 				est = 0;
1141 | 				for( i = 0; i < x*n; ++i )
1142 | 				{
1143 | 					est += abs((signed char)line_buffer[i]);
1144 | 				}
1145 | 				if( est < best_filter_val )
1146 | 				{
1147 | 					best_filter_val = est;
1148 | 					best_filter = filter_type;
1149 | 				}
1150 | 			}
1151 | 			if( filter_type != best_filter )
1152 | 			{  // If the last iteration already got us the best filter, don't redo it
1153 | 				stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer);
1154 | 				filter_type = best_filter;
1155 | 			}
1156 | 		}
1157 | 		// when we get here, filter_type contains the filter type, and line_buffer contains the data
1158 | 		filt[j*(x*n + 1)] = (unsigned char)filter_type;
1159 | 		STBIW_MEMMOVE(filt + j * (x*n + 1) + 1, line_buffer, x*n);
1160 | 	}
1161 | 	STBIW_FREE(line_buffer);
1162 | 	zlib = stbi_zlib_compress(filt, y*(x*n + 1), &zlen, stbi_write_png_compression_level);
1163 | 	STBIW_FREE(filt);
1164 | 	if( !zlib ) return 0;
1165 | 
1166 | 	// each tag requires 12 bytes of overhead
1167 | 	out = (unsigned char *)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
1168 | 	if( !out ) return 0;
1169 | 	*out_len = 8 + 12 + 13 + 12 + zlen + 12;
1170 | 
1171 | 	o = out;
1172 | 	STBIW_MEMMOVE(o, sig, 8); o += 8;
1173 | 	stbiw__wp32(o, 13); // header length
1174 | 	stbiw__wptag(o, "IHDR");
1175 | 	stbiw__wp32(o, x);
1176 | 	stbiw__wp32(o, y);
1177 | 	*o++ = 8;
1178 | 	*o++ = STBIW_UCHAR(ctype[n]);
1179 | 	*o++ = 0;
1180 | 	*o++ = 0;
1181 | 	*o++ = 0;
1182 | 	stbiw__wpcrc(&o, 13);
1183 | 
1184 | 	stbiw__wp32(o, zlen);
1185 | 	stbiw__wptag(o, "IDAT");
1186 | 	STBIW_MEMMOVE(o, zlib, zlen);
1187 | 	o += zlen;
1188 | 	STBIW_FREE(zlib);
1189 | 	stbiw__wpcrc(&o, zlen);
1190 | 
1191 | 	stbiw__wp32(o, 0);
1192 | 	stbiw__wptag(o, "IEND");
1193 | 	stbiw__wpcrc(&o, 0);
1194 | 
1195 | 	STBIW_ASSERT(o == out + *out_len);
1196 | 
1197 | 	return out;
1198 | }
1199 | 
1200 | #ifndef STBI_WRITE_NO_STDIO
1201 | STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
1202 | {
1203 | 	FILE *f;
1204 | 	int len;
1205 | 	unsigned char *png = stbi_write_png_to_mem((unsigned char *)data, stride_bytes, x, y, comp, &len);
1206 | 	if( png == NULL ) return 0;
1207 | #ifdef STBI_MSC_SECURE_CRT
1208 | 	if( fopen_s(&f, filename, "wb") )
1209 | 		f = NULL;
1210 | #else
1211 | 	f = fopen(filename, "wb");
1212 | #endif
1213 | 	if( !f ) { STBIW_FREE(png); return 0; }
1214 | 	fwrite(png, 1, len, f);
1215 | 	fclose(f);
1216 | 	STBIW_FREE(png);
1217 | 	return 1;
1218 | }
1219 | #endif
1220 | 
1221 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
1222 | {
1223 | 	int len;
1224 | 	unsigned char *png = stbi_write_png_to_mem((unsigned char *)data, stride_bytes, x, y, comp, &len);
1225 | 	if( png == NULL ) return 0;
1226 | 	func(context, png, len);
1227 | 	STBIW_FREE(png);
1228 | 	return 1;
1229 | }
1230 | 
1231 | 
1232 | /* ***************************************************************************
1233 | *
1234 | * JPEG writer
1235 | *
1236 | * This is based on Jon Olick's jo_jpeg.cpp:
1237 | * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
1238 | */
1239 | 
// Zig-zag reordering table for an 8x8 block: entry i gives the position in the
// zig-zag-ordered output array at which coefficient i (row-major order) is
// stored.
static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
1242 | 
1243 | static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs)
1244 | {
1245 | 	int bitBuf = *bitBufP, bitCnt = *bitCntP;
1246 | 	bitCnt += bs[1];
1247 | 	bitBuf |= bs[0] << (24 - bitCnt);
1248 | 	while( bitCnt >= 8 )
1249 | 	{
1250 | 		unsigned char c = (bitBuf >> 16) & 255;
1251 | 		stbiw__putc(s, c);
1252 | 		if( c == 255 )
1253 | 		{
1254 | 			stbiw__putc(s, 0);
1255 | 		}
1256 | 		bitBuf <<= 8;
1257 | 		bitCnt -= 8;
1258 | 	}
1259 | 	*bitBufP = bitBuf;
1260 | 	*bitCntP = bitCnt;
1261 | }
1262 | 
// In-place 8-point forward DCT butterfly over the eight values referenced by
// d0p..d7p. All inputs are read before any output is written.
static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p)
{
	const float in0 = *d0p, in1 = *d1p, in2 = *d2p, in3 = *d3p;
	const float in4 = *d4p, in5 = *d5p, in6 = *d6p, in7 = *d7p;

	// Stage 1: sums and differences of mirrored input pairs.
	const float sum07 = in0 + in7, diff07 = in0 - in7;
	const float sum16 = in1 + in6, diff16 = in1 - in6;
	const float sum25 = in2 + in5, diff25 = in2 - in5;
	const float sum34 = in3 + in4, diff34 = in3 - in4;

	// Even part (outputs 0, 2, 4, 6), built from the pair sums.
	const float even_a = sum07 + sum34;
	const float even_d = sum07 - sum34;
	const float even_b = sum16 + sum25;
	const float even_c = sum16 - sum25;
	const float even_rot = (even_c + even_d) * 0.707106781f; // c4

	// Odd part (outputs 1, 3, 5, 7), built from the pair differences.
	const float odd_a = diff34 + diff25;
	const float odd_b = diff25 + diff16;
	const float odd_c = diff16 + diff07;

	// The rotator is modified from fig 4-8 to avoid extra negations.
	const float z5 = (odd_a - odd_c) * 0.382683433f; // c6
	const float z2 = odd_a * 0.541196100f + z5; // c2-c6
	const float z4 = odd_c * 1.306562965f + z5; // c2+c6
	const float z3 = odd_b * 0.707106781f; // c4

	const float z11 = diff07 + z3;
	const float z13 = diff07 - z3;

	*d5p = z13 + z2;
	*d3p = z13 - z2;
	*d1p = z11 + z4;
	*d7p = z11 - z4;

	*d0p = even_a + even_b;
	*d2p = even_d + even_rot;
	*d4p = even_a - even_b;
	*d6p = even_d - even_rot;
}
1311 | 
// Computes the JPEG magnitude encoding of `val`:
//   bits[1] - number of bits needed for |val| (at least 1)
//   bits[0] - the low bits[1] bits of val, or of val - 1 when val is negative
static void stbiw__jpg_calcBits(int val, unsigned short bits[2])
{
	int magnitude = val;
	int encoded = val;
	unsigned short nbits = 1;

	if( val < 0 )
	{
		magnitude = -val;
		encoded = val - 1;
	}

	// One extra bit for every further significant bit in the magnitude.
	for( magnitude >>= 1; magnitude != 0; magnitude >>= 1 )
		++nbits;

	bits[1] = nbits;
	bits[0] = encoded & ((1 << nbits) - 1);
}
1323 | 
1324 | static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2])
1325 | {
1326 | 	const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
1327 | 	const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
1328 | 	int dataOff, i, diff, end0pos;
1329 | 	int DU[64];
1330 | 
1331 | 	// DCT rows
1332 | 	for( dataOff = 0; dataOff<64; dataOff += 8 )
1333 | 	{
1334 | 		stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]);
1335 | 	}
1336 | 	// DCT columns
1337 | 	for( dataOff = 0; dataOff<8; ++dataOff )
1338 | 	{
1339 | 		stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40], &CDU[dataOff + 48], &CDU[dataOff + 56]);
1340 | 	}
1341 | 	// Quantize/descale/zigzag the coefficients
1342 | 	for( i = 0; i<64; ++i )
1343 | 	{
1344 | 		float v = CDU[i] * fdtbl[i];
1345 | 		// DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
1346 | 		// ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
1347 | 		DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
1348 | 	}
1349 | 
1350 | 	// Encode DC
1351 | 	diff = DU[0] - DC;
1352 | 	if( diff == 0 )
1353 | 	{
1354 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
1355 | 	}
1356 | 	else
1357 | 	{
1358 | 		unsigned short bits[2];
1359 | 		stbiw__jpg_calcBits(diff, bits);
1360 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
1361 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1362 | 	}
1363 | 	// Encode ACs
1364 | 	end0pos = 63;
1365 | 	for( ; (end0pos>0) && (DU[end0pos] == 0); --end0pos )
1366 | 	{
1367 | 	}
1368 | 	// end0pos = first element in reverse order !=0
1369 | 	if( end0pos == 0 )
1370 | 	{
1371 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1372 | 		return DU[0];
1373 | 	}
1374 | 	for( i = 1; i <= end0pos; ++i )
1375 | 	{
1376 | 		int startpos = i;
1377 | 		int nrzeroes;
1378 | 		unsigned short bits[2];
1379 | 		for( ; DU[i] == 0 && i <= end0pos; ++i )
1380 | 		{
1381 | 		}
1382 | 		nrzeroes = i - startpos;
1383 | 		if( nrzeroes >= 16 )
1384 | 		{
1385 | 			int lng = nrzeroes >> 4;
1386 | 			int nrmarker;
1387 | 			for( nrmarker = 1; nrmarker <= lng; ++nrmarker )
1388 | 				stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
1389 | 			nrzeroes &= 15;
1390 | 		}
1391 | 		stbiw__jpg_calcBits(DU[i], bits);
1392 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
1393 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1394 | 	}
1395 | 	if( end0pos != 63 )
1396 | 	{
1397 | 		stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1398 | 	}
1399 | 	return DU[0];
1400 | }
1401 | 
1402 | static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality)
1403 | {
1404 | 	// Constants that don't pollute global namespace
1405 | 	static const unsigned char std_dc_luminance_nrcodes[] = { 0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0 };
1406 | 	static const unsigned char std_dc_luminance_values[] = { 0,1,2,3,4,5,6,7,8,9,10,11 };
1407 | 	static const unsigned char std_ac_luminance_nrcodes[] = { 0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d };
1408 | 	static const unsigned char std_ac_luminance_values[] = {
1409 | 		0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
1410 | 		0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
1411 | 		0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
1412 | 		0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
1413 | 		0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
1414 | 		0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
1415 | 		0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
1416 | 	};
1417 | 	static const unsigned char std_dc_chrominance_nrcodes[] = { 0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0 };
1418 | 	static const unsigned char std_dc_chrominance_values[] = { 0,1,2,3,4,5,6,7,8,9,10,11 };
1419 | 	static const unsigned char std_ac_chrominance_nrcodes[] = { 0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77 };
1420 | 	static const unsigned char std_ac_chrominance_values[] = {
1421 | 		0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
1422 | 		0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
1423 | 		0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
1424 | 		0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
1425 | 		0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
1426 | 		0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
1427 | 		0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
1428 | 	};
1429 | 	// Huffman tables
1430 | 	static const unsigned short YDC_HT[256][2] = { { 0,2 },{ 2,3 },{ 3,3 },{ 4,3 },{ 5,3 },{ 6,3 },{ 14,4 },{ 30,5 },{ 62,6 },{ 126,7 },{ 254,8 },{ 510,9 } };
1431 | 	static const unsigned short UVDC_HT[256][2] = { { 0,2 },{ 1,2 },{ 2,2 },{ 6,3 },{ 14,4 },{ 30,5 },{ 62,6 },{ 126,7 },{ 254,8 },{ 510,9 },{ 1022,10 },{ 2046,11 } };
1432 | 	static const unsigned short YAC_HT[256][2] = {
1433 | 		{ 10,4 },{ 0,2 },{ 1,2 },{ 4,3 },{ 11,4 },{ 26,5 },{ 120,7 },{ 248,8 },{ 1014,10 },{ 65410,16 },{ 65411,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1434 | 	{ 12,4 },{ 27,5 },{ 121,7 },{ 502,9 },{ 2038,11 },{ 65412,16 },{ 65413,16 },{ 65414,16 },{ 65415,16 },{ 65416,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1435 | 	{ 28,5 },{ 249,8 },{ 1015,10 },{ 4084,12 },{ 65417,16 },{ 65418,16 },{ 65419,16 },{ 65420,16 },{ 65421,16 },{ 65422,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1436 | 	{ 58,6 },{ 503,9 },{ 4085,12 },{ 65423,16 },{ 65424,16 },{ 65425,16 },{ 65426,16 },{ 65427,16 },{ 65428,16 },{ 65429,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1437 | 	{ 59,6 },{ 1016,10 },{ 65430,16 },{ 65431,16 },{ 65432,16 },{ 65433,16 },{ 65434,16 },{ 65435,16 },{ 65436,16 },{ 65437,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1438 | 	{ 122,7 },{ 2039,11 },{ 65438,16 },{ 65439,16 },{ 65440,16 },{ 65441,16 },{ 65442,16 },{ 65443,16 },{ 65444,16 },{ 65445,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1439 | 	{ 123,7 },{ 4086,12 },{ 65446,16 },{ 65447,16 },{ 65448,16 },{ 65449,16 },{ 65450,16 },{ 65451,16 },{ 65452,16 },{ 65453,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1440 | 	{ 250,8 },{ 4087,12 },{ 65454,16 },{ 65455,16 },{ 65456,16 },{ 65457,16 },{ 65458,16 },{ 65459,16 },{ 65460,16 },{ 65461,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1441 | 	{ 504,9 },{ 32704,15 },{ 65462,16 },{ 65463,16 },{ 65464,16 },{ 65465,16 },{ 65466,16 },{ 65467,16 },{ 65468,16 },{ 65469,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1442 | 	{ 505,9 },{ 65470,16 },{ 65471,16 },{ 65472,16 },{ 65473,16 },{ 65474,16 },{ 65475,16 },{ 65476,16 },{ 65477,16 },{ 65478,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1443 | 	{ 506,9 },{ 65479,16 },{ 65480,16 },{ 65481,16 },{ 65482,16 },{ 65483,16 },{ 65484,16 },{ 65485,16 },{ 65486,16 },{ 65487,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1444 | 	{ 1017,10 },{ 65488,16 },{ 65489,16 },{ 65490,16 },{ 65491,16 },{ 65492,16 },{ 65493,16 },{ 65494,16 },{ 65495,16 },{ 65496,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1445 | 	{ 1018,10 },{ 65497,16 },{ 65498,16 },{ 65499,16 },{ 65500,16 },{ 65501,16 },{ 65502,16 },{ 65503,16 },{ 65504,16 },{ 65505,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1446 | 	{ 2040,11 },{ 65506,16 },{ 65507,16 },{ 65508,16 },{ 65509,16 },{ 65510,16 },{ 65511,16 },{ 65512,16 },{ 65513,16 },{ 65514,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1447 | 	{ 65515,16 },{ 65516,16 },{ 65517,16 },{ 65518,16 },{ 65519,16 },{ 65520,16 },{ 65521,16 },{ 65522,16 },{ 65523,16 },{ 65524,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1448 | 	{ 2041,11 },{ 65525,16 },{ 65526,16 },{ 65527,16 },{ 65528,16 },{ 65529,16 },{ 65530,16 },{ 65531,16 },{ 65532,16 },{ 65533,16 },{ 65534,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 }
1449 | 	};
1450 | 	static const unsigned short UVAC_HT[256][2] = {
1451 | 		{ 0,2 },{ 1,2 },{ 4,3 },{ 10,4 },{ 24,5 },{ 25,5 },{ 56,6 },{ 120,7 },{ 500,9 },{ 1014,10 },{ 4084,12 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1452 | 	{ 11,4 },{ 57,6 },{ 246,8 },{ 501,9 },{ 2038,11 },{ 4085,12 },{ 65416,16 },{ 65417,16 },{ 65418,16 },{ 65419,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1453 | 	{ 26,5 },{ 247,8 },{ 1015,10 },{ 4086,12 },{ 32706,15 },{ 65420,16 },{ 65421,16 },{ 65422,16 },{ 65423,16 },{ 65424,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1454 | 	{ 27,5 },{ 248,8 },{ 1016,10 },{ 4087,12 },{ 65425,16 },{ 65426,16 },{ 65427,16 },{ 65428,16 },{ 65429,16 },{ 65430,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1455 | 	{ 58,6 },{ 502,9 },{ 65431,16 },{ 65432,16 },{ 65433,16 },{ 65434,16 },{ 65435,16 },{ 65436,16 },{ 65437,16 },{ 65438,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1456 | 	{ 59,6 },{ 1017,10 },{ 65439,16 },{ 65440,16 },{ 65441,16 },{ 65442,16 },{ 65443,16 },{ 65444,16 },{ 65445,16 },{ 65446,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1457 | 	{ 121,7 },{ 2039,11 },{ 65447,16 },{ 65448,16 },{ 65449,16 },{ 65450,16 },{ 65451,16 },{ 65452,16 },{ 65453,16 },{ 65454,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1458 | 	{ 122,7 },{ 2040,11 },{ 65455,16 },{ 65456,16 },{ 65457,16 },{ 65458,16 },{ 65459,16 },{ 65460,16 },{ 65461,16 },{ 65462,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1459 | 	{ 249,8 },{ 65463,16 },{ 65464,16 },{ 65465,16 },{ 65466,16 },{ 65467,16 },{ 65468,16 },{ 65469,16 },{ 65470,16 },{ 65471,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1460 | 	{ 503,9 },{ 65472,16 },{ 65473,16 },{ 65474,16 },{ 65475,16 },{ 65476,16 },{ 65477,16 },{ 65478,16 },{ 65479,16 },{ 65480,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1461 | 	{ 504,9 },{ 65481,16 },{ 65482,16 },{ 65483,16 },{ 65484,16 },{ 65485,16 },{ 65486,16 },{ 65487,16 },{ 65488,16 },{ 65489,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1462 | 	{ 505,9 },{ 65490,16 },{ 65491,16 },{ 65492,16 },{ 65493,16 },{ 65494,16 },{ 65495,16 },{ 65496,16 },{ 65497,16 },{ 65498,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1463 | 	{ 506,9 },{ 65499,16 },{ 65500,16 },{ 65501,16 },{ 65502,16 },{ 65503,16 },{ 65504,16 },{ 65505,16 },{ 65506,16 },{ 65507,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1464 | 	{ 2041,11 },{ 65508,16 },{ 65509,16 },{ 65510,16 },{ 65511,16 },{ 65512,16 },{ 65513,16 },{ 65514,16 },{ 65515,16 },{ 65516,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1465 | 	{ 16352,14 },{ 65517,16 },{ 65518,16 },{ 65519,16 },{ 65520,16 },{ 65521,16 },{ 65522,16 },{ 65523,16 },{ 65524,16 },{ 65525,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },
1466 | 	{ 1018,10 },{ 32707,15 },{ 65526,16 },{ 65527,16 },{ 65528,16 },{ 65529,16 },{ 65530,16 },{ 65531,16 },{ 65532,16 },{ 65533,16 },{ 65534,16 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 },{ 0,0 }
1467 | 	};
1468 | 	static const int YQT[] = { 16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
1469 | 		37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99 };
1470 | 	static const int UVQT[] = { 17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
1471 | 		99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99 };
1472 | 	static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
1473 | 		1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
1474 | 
1475 | 	int row, col, i, k;
1476 | 	float fdtbl_Y[64], fdtbl_UV[64];
1477 | 	unsigned char YTable[64], UVTable[64];
1478 | 
1479 | 	if( !data || !width || !height || comp > 4 || comp < 1 )
1480 | 	{
1481 | 		return 0;
1482 | 	}
1483 | 
1484 | 	quality = quality ? quality : 90;
1485 | 	quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
1486 | 	quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
1487 | 
1488 | 	for( i = 0; i < 64; ++i )
1489 | 	{
1490 | 		int uvti, yti = (YQT[i] * quality + 50) / 100;
1491 | 		YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 : yti);
1492 | 		uvti = (UVQT[i] * quality + 50) / 100;
1493 | 		UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
1494 | 	}
1495 | 
1496 | 	for( row = 0, k = 0; row < 8; ++row )
1497 | 	{
1498 | 		for( col = 0; col < 8; ++col, ++k )
1499 | 		{
1500 | 			fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1501 | 			fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1502 | 		}
1503 | 	}
1504 | 
1505 | 	// Write Headers
1506 | 	{
1507 | 		static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
1508 | 		static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
1509 | 		const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height >> 8),STBIW_UCHAR(height),(unsigned char)(width >> 8),STBIW_UCHAR(width),
1510 | 			3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
1511 | 		s->func(s->context, (void*)head0, sizeof(head0));
1512 | 		s->func(s->context, (void*)YTable, sizeof(YTable));
1513 | 		stbiw__putc(s, 1);
1514 | 		s->func(s->context, UVTable, sizeof(UVTable));
1515 | 		s->func(s->context, (void*)head1, sizeof(head1));
1516 | 		s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1), sizeof(std_dc_luminance_nrcodes) - 1);
1517 | 		s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
1518 | 		stbiw__putc(s, 0x10); // HTYACinfo
1519 | 		s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1), sizeof(std_ac_luminance_nrcodes) - 1);
1520 | 		s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
1521 | 		stbiw__putc(s, 1); // HTUDCinfo
1522 | 		s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1), sizeof(std_dc_chrominance_nrcodes) - 1);
1523 | 		s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
1524 | 		stbiw__putc(s, 0x11); // HTUACinfo
1525 | 		s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1), sizeof(std_ac_chrominance_nrcodes) - 1);
1526 | 		s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
1527 | 		s->func(s->context, (void*)head2, sizeof(head2));
1528 | 	}
1529 | 
1530 | 	// Encode 8x8 macroblocks
1531 | 	{
1532 | 		static const unsigned short fillBits[] = { 0x7F, 7 };
1533 | 		const unsigned char *imageData = (const unsigned char *)data;
1534 | 		int DCY = 0, DCU = 0, DCV = 0;
1535 | 		int bitBuf = 0, bitCnt = 0;
1536 | 		// comp == 2 is grey+alpha (alpha is ignored)
1537 | 		int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
1538 | 		int x, y, pos;
1539 | 		for( y = 0; y < height; y += 8 )
1540 | 		{
1541 | 			for( x = 0; x < width; x += 8 )
1542 | 			{
1543 | 				float YDU[64], UDU[64], VDU[64];
1544 | 				for( row = y, pos = 0; row < y + 8; ++row )
1545 | 				{
1546 | 					for( col = x; col < x + 8; ++col, ++pos )
1547 | 					{
1548 | 						int p = (stbi__flip_vertically_on_write ? height - 1 - row : row)*width*comp + col * comp;
1549 | 						float r, g, b;
1550 | 						if( row >= height )
1551 | 						{
1552 | 							p -= width * comp*(row + 1 - height);
1553 | 						}
1554 | 						if( col >= width )
1555 | 						{
1556 | 							p -= comp * (col + 1 - width);
1557 | 						}
1558 | 
1559 | 						r = imageData[p + 0];
1560 | 						g = imageData[p + ofsG];
1561 | 						b = imageData[p + ofsB];
1562 | 						YDU[pos] = +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
1563 | 						UDU[pos] = -0.16874f*r - 0.33126f*g + 0.50000f*b;
1564 | 						VDU[pos] = +0.50000f*r - 0.41869f*g - 0.08131f*b;
1565 | 					}
1566 | 				}
1567 | 
1568 | 				DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1569 | 				DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
1570 | 				DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
1571 | 			}
1572 | 		}
1573 | 
1574 | 		// Do the bit alignment of the EOI marker
1575 | 		stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
1576 | 	}
1577 | 
1578 | 	// EOI
1579 | 	stbiw__putc(s, 0xFF);
1580 | 	stbiw__putc(s, 0xD9);
1581 | 
1582 | 	return 1;
1583 | }
1584 | 
1585 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
1586 | {
1587 | 	stbi__write_context sink; // write state handed to the encoder; set up from func/context on the next line
1588 | 	stbi__start_write_callbacks(&sink, func, context);
1589 | 	return stbi_write_jpg_core(&sink, x, y, comp, (void *)data, quality); // encoder's result is passed through unchanged
1590 | }
1591 | 
1592 | 
1593 | #ifndef STBI_WRITE_NO_STDIO
1594 | STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
1595 | {
1596 | 	// JPEG-encode `data` through a write context started on `filename`.
1597 | 	// Yields 0 if stbi__start_write_file reports failure; otherwise yields whatever stbi_write_jpg_core returns.
1598 | 	stbi__write_context fileCtx;
1599 | 	int status;
1600 | 	if( !stbi__start_write_file(&fileCtx, filename) )
1601 | 		return 0;
1602 | 	status = stbi_write_jpg_core(&fileCtx, x, y, comp, data, quality);
1603 | 	stbi__end_write_file(&fileCtx); // runs after every successful start, regardless of the encoder's result
1604 | 	return status;
1605 | }
1606 | #endif
1607 | 
1608 | #endif // STB_IMAGE_WRITE_IMPLEMENTATION
1609 | 
1610 | /* Revision history
1611 | 1.09  (2018-02-11)
1612 | fix typo in zlib quality API, improve STB_I_W_STATIC in C++
1613 | 1.08  (2018-01-29)
1614 | add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
1615 | 1.07  (2017-07-24)
1616 | doc fix
1617 | 1.06 (2017-07-23)
1618 | writing JPEG (using Jon Olick's code)
1619 | 1.05   ???
1620 | 1.04 (2017-03-03)
1621 | monochrome BMP expansion
1622 | 1.03   ???
1623 | 1.02 (2016-04-02)
1624 | avoid allocating large structures on the stack
1625 | 1.01 (2016-01-16)
1626 | STBIW_REALLOC_SIZED: support allocators with no realloc support
1627 | avoid race-condition in crc initialization
1628 | minor compile issues
1629 | 1.00 (2015-09-14)
1630 | installable file IO function
1631 | 0.99 (2015-09-13)
1632 | warning fixes; TGA rle support
1633 | 0.98 (2015-04-08)
1634 | added STBIW_MALLOC, STBIW_ASSERT etc
1635 | 0.97 (2015-01-18)
1636 | fixed HDR asserts, rewrote HDR rle logic
1637 | 0.96 (2015-01-17)
1638 | add HDR output
1639 | fix monochrome BMP
1640 | 0.95 (2014-08-17)
1641 | add monochrome TGA output
1642 | 0.94 (2014-05-31)
1643 | rename private functions to avoid conflicts with stb_image.h
1644 | 0.93 (2014-05-27)
1645 | warning fixes
1646 | 0.92 (2010-08-01)
1647 | casts to unsigned char to fix warnings
1648 | 0.91 (2010-07-17)
1649 | first public release
1650 | 0.90   first internal release
1651 | */
1652 | 
1653 | /*
1654 | ------------------------------------------------------------------------------
1655 | This software is available under 2 licenses -- choose whichever you prefer.
1656 | ------------------------------------------------------------------------------
1657 | ALTERNATIVE A - MIT License
1658 | Copyright (c) 2017 Sean Barrett
1659 | Permission is hereby granted, free of charge, to any person obtaining a copy of
1660 | this software and associated documentation files (the "Software"), to deal in
1661 | the Software without restriction, including without limitation the rights to
1662 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
1663 | of the Software, and to permit persons to whom the Software is furnished to do
1664 | so, subject to the following conditions:
1665 | The above copyright notice and this permission notice shall be included in all
1666 | copies or substantial portions of the Software.
1667 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1668 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1669 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1670 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1671 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1672 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1673 | SOFTWARE.
1674 | ------------------------------------------------------------------------------
1675 | ALTERNATIVE B - Public Domain (www.unlicense.org)
1676 | This is free and unencumbered software released into the public domain.
1677 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
1678 | software, either in source code form or as a compiled binary, for any purpose,
1679 | commercial or non-commercial, and by any means.
1680 | In jurisdictions that recognize copyright laws, the author or authors of this
1681 | software dedicate any and all copyright interest in the software to the public
1682 | domain. We make this dedication for the benefit of the public at large and to
1683 | the detriment of our heirs and successors. We intend this dedication to be an
1684 | overt act of relinquishment in perpetuity of all present and future rights to
1685 | this software under copyright law.
1686 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1687 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1688 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1689 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
1690 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1691 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1692 | ------------------------------------------------------------------------------
1693 | */


--------------------------------------------------------------------------------