├── CMakeLists.txt
├── LICENSE
├── README
├── bc7decomp.c
├── bc7decomp.h
├── bc7enc.cpp
├── bc7enc16.c
├── bc7enc16.h
├── build_msvc.cmd
├── dds_defs.h
├── ktx_defs.h
├── lodepng.cpp
└── lodepng.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # cmake_minimum_required() must run before project() so policy defaults are in place when languages are enabled.
 2 | # The upper bound keeps this file usable with CMake 4.x, which rejects policy versions older than 3.5.
 3 | cmake_minimum_required(VERSION 2.8.12...3.29)
 4 | project(bc7enc)
 5 | option(BUILD_X64 "build 64-bit" TRUE)
 6 | message("Initial BUILD_X64=${BUILD_X64}")
 7 | message("Initial CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
 8 | 
 9 | if( NOT CMAKE_BUILD_TYPE )
10 |   set( CMAKE_BUILD_TYPE Release )
11 | endif()
12 | 
13 | message("${PROJECT_NAME} build type: ${CMAKE_BUILD_TYPE}")
14 | 
15 | if (BUILD_X64)
16 | 	message("Building 64-bit")
17 | else()
18 | 	message("Building 32-bit")
19 | endif()
20 | 
21 | if (NOT MSVC)
22 | 	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
23 | 	# Extend the C flags from the C variable; seeding them from the C++ variable dropped any C-specific settings.
24 | 	set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g")
25 | endif()
26 | 
27 | # GCC_COMPILE_FLAGS is only set for non-MSVC compilers; for MSVC it expands to nothing in the set() calls below.
28 | 
29 | # -fno-strict-aliasing shouldn't be necessary, it's here because that is what MSVC uses by default and that's what I've tested with the most.
30 | if (NOT MSVC)
31 | 	set(GCC_COMPILE_FLAGS "-fno-strict-aliasing -Wall -Wextra")
32 | 	if (NOT BUILD_X64)
33 | 		set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -m32")
34 | 	endif()
35 | endif()
36 | 
37 | set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
38 | set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
39 | set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
40 | 
41 | set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} ${GCC_COMPILE_FLAGS}")
42 | set(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} ${GCC_COMPILE_FLAGS}")
43 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${GCC_COMPILE_FLAGS} -D_DEBUG")
44 | # COMMON_SRC_LIST is not set anywhere in this file; it contributes nothing unless an enclosing scope defines it.
45 | set (BC7ENC_SRC_LIST ${COMMON_SRC_LIST}
46 | 	bc7enc.cpp
47 | 	lodepng.cpp
48 | 	bc7decomp.c
49 | 	bc7enc16.c
50 | 	)
51 | 
52 | add_executable(bc7enc ${BC7ENC_SRC_LIST})
53 | 
54 | if (NOT MSVC)
55 | 	target_link_libraries(bc7enc PRIVATE m)
56 | endif()
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | bc7enc16.c/.h is available under 2 licenses -- choose whichever you prefer:
 2 | 
 3 | ALTERNATIVE A for bc7enc.c/.h - MIT License
 4 | Copyright(c) 2018 Richard Geldreich, Jr.
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files(the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
 9 | of the Software, and to permit persons to whom the Software is furnished to do
10 | so, subject to the following conditions :
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | ------------------------------------------------------------------------------
21 | ALTERNATIVE B for bc7enc.c/.h - Public Domain(www.unlicense.org)
22 | This is free and unencumbered software released into the public domain.
23 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
24 | software, either in source code form or as a compiled binary, for any purpose,
25 | commercial or non - commercial, and by any means.
26 | In jurisdictions that recognize copyright laws, the author or authors of this
27 | software dedicate any and all copyright interest in the software to the public
28 | domain.We make this dedication for the benefit of the public at large and to
29 | the detriment of our heirs and successors.We intend this dedication to be an
30 | overt act of relinquishment in perpetuity of all present and future rights to
31 | this software under copyright law.
32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
35 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
36 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
37 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
38 | ------------------------------------------------------------------------------
39 | 
40 | bc7decomp.c/.h: Copyright (c) 2015 Harm Hanemaaijer <fgenfb@yahoo.com>
41 | Permission to use, copy, modify, and/or distribute this software for any
42 | purpose with or without fee is hereby granted, provided that the above
43 | copyright notice and this permission notice appear in all copies.
44 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
45 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
46 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
47 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
48 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
49 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
50 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
51 | 
52 | ------------------------------------------------------------------------------
53 | 
54 | LodePNG version 20161127
55 | 
56 | Copyright (c) 2005-2016 Lode Vandevenne
57 | 
58 | This software is provided 'as-is', without any express or implied
59 | warranty. In no event will the authors be held liable for any damages
60 | arising from the use of this software.
61 | 
62 | Permission is granted to anyone to use this software for any purpose,
63 | including commercial applications, and to alter it and redistribute it
64 | freely, subject to the following restrictions:
65 | 
66 | 	 1. The origin of this software must not be misrepresented; you must not
67 | 	 claim that you wrote the original software. If you use this software
68 | 	 in a product, an acknowledgment in the product documentation would be
69 | 	 appreciated but is not required.
70 | 
71 | 	 2. Altered source versions must be plainly marked as such, and must not be
72 | 	 misrepresented as being the original software.
73 | 
74 | 	 3. This notice may not be removed or altered from any source
75 | 	 distribution.
76 | 	 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | bc7enc16 - Fast, single source file BC7/BPTC GPU texture encoder with perceptual colorspace metric support
  2 | 
  3 | Note: Since this repo was created, we've released two new codecs with better BC7 encoders:
  4 | https://github.com/richgel999/bc7enc_rdo
  5 | https://github.com/BinomialLLC/bc7e
  6 | 
  7 | bc7enc16 purposely only supports modes 1 and 6. This is a strong opaque texture encoder, with basic
  8 | support for alpha channels (using mode 6). The intended use case is opaque textures, or opaque textures 
  9 | with relatively simple alpha channels. It also acts as a relatively simple to understand example.
 10 | 
 11 | If alpha is highly correlated compared to RGB, or alpha is relatively simple
 12 | (think simple masks where lots of blocks are either all-transparent or
 13 | all-opaque), it should work great. For complex alpha channels more modes (such
 14 | as 4, 5 or maybe 7) are necessary.
 15 | 
 16 | This codec supports a perceptual mode, where it computes colorspace error in
 17 | weighted YCbCr space (like etc2comp), and it also supports weighted RGBA
 18 | metrics. It's particularly strong in perceptual mode, beating the current state of
 19 | the art CPU encoder (Intel's ispc_texcomp) by a wide margin when measured by
 20 | Luma PSNR, even though it only supports 2 modes and isn't vectorized.
 21 | 
 22 | Why only modes 1 and 6?
 23 | Because with these two modes you have a complete encoder that supports both
 24 | opaque and transparent textures in a small amount (~1400 lines) of
 25 | understandable plain C code. Mode 6 excels on smooth blocks, and mode 1 is
 26 | strong with complex blocks, and a strong encoder that combines both modes can be
 27 | quite high quality. Fast mode 6-only encoders will have noticeable block
 28 | artifacts which this codec avoids by fully supporting mode 1.
 29 | 
 30 | Modes 1 and 6 are typically the most used modes on many textures using other
 31 | encoders. Mode 1 has two subsets, 64 possible partitions, and 3-bit indices,
 32 | while mode 6 has large 4-bit indices and high precision 7777.1 endpoints. This
 33 | codec produces output that is far higher quality than any BC1 encoder, and
 34 | approaches (or in perceptual mode exceeds!) the quality of other full BC7
 35 | encoders.
 36 | 
 37 | Why is bc7enc16 so fast in perceptual mode?
 38 | Computing error in YCbCr space is more expensive than in RGB space, yet bc7enc16
 39 | in perceptual mode is stronger than ispc_texcomp (see the benchmark below) -
 40 | even without SSE/AVX vectorization and with only 2 modes to work with!
 41 | 
 42 | Most BC7 encoders only support linear RGB colorspace metrics, which is a
 43 | fundamental weakness. Some support weighted RGB metrics, which is better. With
 44 | linear RGB metrics, encoding error is roughly balanced between each channel, and
 45 | encoders have to work *very* hard (examining large amounts of RGB search space)
 46 | to get overall quality up. With perceptual colorspace metrics, RGB error tends
 47 | to become a bit unbalanced, with green quality favored more highly than red and
 48 | blue, and blue quality favored the least. A perceptual encoder is tuned to
 49 | prefer exploring solutions along the luma axis, where it's much less work to find
 50 | solutions with less luma error. bc7enc16 is, as far as I know, the first BC7
 51 | codec to support computing error in weighted YCbCr colorspace.
 52 | 
 53 | Note: Most of the timings here (except for the ispc_texcomp "fast" mode timings at the very bottom)
 54 | are for the *original* release, before I added several more optimizations. The latest version of 
 55 | bc7enc16.c is around 8-27% faster than the initial release at same quality (when mode 1 is enabled - 
 56 | there's no change with just mode 6).
 57 | 
 58 | Some benchmarks across 31 images (kodim corpus+others):
 59 | 
 60 | Perceptual (average REC709 Luma PSNR - higher is better quality):
 61 | 
 62 | ispc_texcomp slow vs. bc7enc16 uber4/max_partitions 64
 63 | ispc_texcomp:   355.4 secs 48.6 dB
 64 | bc7enc16:       122.6 secs 50.0 dB
 65 | 
 66 | ispc_texcomp slow vs. bc7enc16 uber0/max_partitions 64
 67 | ispc_texcomp:   355.4 secs 48.6 dB
 68 | bc7enc16:       38.3 secs 49.6 dB
 69 | 
 70 | ispc_texcomp basic vs. bc7enc16 uber0/max_partitions 16
 71 | ispc_texcomp:   100.2 secs 48.3 dB
 72 | bc7enc16:       20.8 secs 49.3 dB
 73 | 
 74 | ispc_texcomp fast vs. bc7enc16 uber0/max_partitions 16
 75 | ispc_texcomp:   41.5 secs 48.0 dB
 76 | bc7enc16:       20.8 secs 49.3 dB
 77 | 
 78 | ispc_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0
 79 | ispc_texcomp:   1.9 secs 46.2 dB
 80 | bc7enc16:       8.9 secs 48.4 dB
 81 | 
 82 | Non-perceptual (average RGB PSNR):
 83 | 
 84 | ispc_texcomp slow vs. bc7enc16 uber4/max_partitions 64
 85 | ispc_texcomp:   355.4 secs 46.8 dB
 86 | bc7enc16:       51 secs 46.1 dB
 87 | 
 88 | ispc_texcomp slow vs. bc7enc16 uber0/max_partitions 64
 89 | ispc_texcomp:   355.4 secs 46.8 dB
 90 | bc7enc16:       29.3 secs 45.8 dB
 91 | 
 92 | ispc_texcomp basic vs. bc7enc16 uber4/max_partitions 64
 93 | ispc_texcomp:   99.9 secs 46.5 dB
 94 | bc7enc16:       51 secs 46.1 dB
 95 | 
 96 | ispc_texcomp fast vs. bc7enc16 uber1/max_partitions 16
 97 | ispc_texcomp:   41.5 secs 46.1 dB
 98 | bc7enc16:       19.8 secs 45.5 dB
 99 | 
100 | ispc_texcomp fast vs. bc7enc16 uber0/max_partitions 8
101 | ispc_texcomp:   41.5 secs 46.1 dB
102 | bc7enc16:       10.46 secs 44.4 dB
103 | 
104 | ispc_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0
105 | ispc_texcomp:   1.9 secs 42.7 dB
106 | bc7enc16:       3.8 secs 42.7 dB
107 | 
108 | DirectXTex CPU in "mode 6 only" mode vs. bc7enc16 uber1/max_partitions 0 (mode 6 only), non-perceptual:
109 | 
110 | DirectXTex:     466.4 secs 41.9 dB 
111 | bc7enc16:       6.7 secs 42.8 dB
112 | 
113 | DirectXTex CPU in (default - no 3 subset modes) vs. bc7enc16 uber1/max_partitions 64, non-perceptual:
114 | 
115 | DirectXTex:     9485.1 secs 45.6 dB 
116 | bc7enc16:       36 secs 46.0 dB
117 | 
118 | (Note this version of DirectXTex has a key pbit bugfix which I've submitted but
119 | is still waiting to be accepted. Non-bugfixed versions will be slightly lower
120 | quality.)
121 | 
122 | UPDATE: To illustrate how strong the mode 1+6 implementation is in bc7enc16, let's compare ispc_texcomp 
123 | fast vs. the latest version of bc7enc16 uber4/max_partitions 64:
124 | 
125 | Without filterbank optimizations:
126 | 
127 |                 Time       RGB PSNR   Y PSNR
128 | ispc_texcomp:   41.45 secs 46.09 dB   48.0 dB
129 | bc7enc16:       41.42 secs 46.03 dB   48.2 dB
130 | 
131 | With filterbank optimizations enabled:
132 | bc7enc16:       38.78 secs 45.94 dB   48.12 dB
133 | 
134 | They both have virtually the same average RGB PSNR with these settings (.06 dB is basically noise), but 
135 | bc7enc16 is just as fast as ispc_texcomp fast, even though it's not vectorized. Interestingly, our Y PSNR is better, 
136 | although bc7enc16 wasn't using perceptual metrics in these benchmarks. 
137 | 
138 | This was a multithreaded benchmark (using OpenMP) on a dual Xeon workstation.
139 | ispc_texcomp was called with 64-blocks at a time and used AVX instructions.
140 | Timings are for encoding only.
141 | 


--------------------------------------------------------------------------------
/bc7decomp.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015 Harm Hanemaaijer <fgenfb@yahoo.com>
  3 | Permission to use, copy, modify, and/or distribute this software for any
  4 | purpose with or without fee is hereby granted, provided that the above
  5 | copyright notice and this permission notice appear in all copies.
  6 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  7 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  8 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  9 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 10 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 11 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 12 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 13 | */
 14 | 
 15 | // Modified by Rich Geldreich 4/26/18- fixed bugs in detexBlock128ExtractBits() and FullyDecodeEndpoints(),
 16 | // compared vs. DirectXTex's BC7 decoder for correctness.
 17 | 
 18 | #include <stdlib.h>
 19 | #include <stdint.h>
 20 | #include <stdbool.h>
 21 | #include <memory.h>
 22 | #include "bc7decomp.h"
 23 | 
 24 | // A 128-bit compressed block stored as two 64-bit words plus a read cursor.
 25 | // Bit positions 0-63 live in data0 and 64-127 in data1; index is the position of the next bit to read.
 26 | 
 27 | typedef struct {
 28 | 	uint64_t data0;
 29 | 	uint64_t data1;
 30 | 	int index;
 31 | } detexBlock128;
 32 | 
 33 | // Reads nu_bits bits starting at block->index and advances block->index by nu_bits.
 34 | // Bit i of the return value is the block bit at position (starting index + i).
 35 | // Positions outside 0..127 read as zero, so the function never shifts a 64-bit word
 36 | // by 64 or more (undefined behaviour in C) when the cursor runs past the block.
 37 | // Only the first 32 bits read can be represented in the result; later ones are dropped.
 38 | uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits) {
 39 | 	uint32_t value = 0;
 40 | 	for (int i = 0; i < nu_bits; i++, block->index++) {
 41 | 		const int pos = block->index;
 42 | 		uint64_t bit = 0;
 43 | 
 44 | 		if (pos >= 0 && pos < 64)
 45 | 			bit = (block->data0 >> pos) & 1;
 46 | 		else if (pos >= 64 && pos < 128)
 47 | 			bit = (block->data1 >> (pos - 64)) & 1;
 48 | 
 49 | 		// i >= 32 cannot be stored in a uint32_t, so those bits are skipped.
 50 | 		if (bit && i < 32)
 51 | 			value |= (uint32_t)1 << i;
 52 | 	}
 53 | 
 54 | 	return value;
 55 | }
 56 | 
 57 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetR8(uint32_t pixel) {
 58 | 	return pixel & 0xFFu;	// Bits 0-7 of the packed pixel.
 59 | }
 60 | 
 61 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetG8(uint32_t pixel) {
 62 | 	return (pixel >> 8) & 0xFFu;	// Bits 8-15 of the packed pixel.
 63 | }
 64 | 
 65 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetB8(uint32_t pixel) {
 66 | 	return (pixel >> 16) & 0xFFu;	// Bits 16-23 of the packed pixel.
 67 | }
 68 | 
 69 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetA8(uint32_t pixel) {
 70 | 	return (pixel >> 24) & 0xFFu;	// Bits 24-31 of the packed pixel.
 71 | }
 72 | 
 73 | static DETEX_INLINE_ONLY uint32_t detexPack32R8(int r) {
 74 | 	return ((uint32_t)r) << 0;	// Red goes into bits 0-7; the value is not masked.
 75 | }
 76 | 
 77 | static DETEX_INLINE_ONLY uint32_t detexPack32G8(int g) {
 78 | 	return ((uint32_t)g) << 8;	// Green starts at bit 8; the value is not masked.
 79 | }
 80 | 
 81 | static DETEX_INLINE_ONLY uint32_t detexPack32B8(int b) {
 82 | 	return ((uint32_t)b) << 16;	// Blue starts at bit 16; the value is not masked.
 83 | }
 84 | 
 85 | static DETEX_INLINE_ONLY uint32_t detexPack32A8(int a) {
 86 | 	return ((uint32_t)a) << 24;	// Alpha starts at bit 24.
 87 | }
 88 | 
 89 | static DETEX_INLINE_ONLY uint32_t detexPack32RGBA8(int r, int g, int b, int a) {
 90 | 	// OR together the four single-channel encodings (R low byte ... A high byte).
 91 | 	return detexPack32R8(r) | detexPack32G8(g) | detexPack32B8(b) | detexPack32A8(a);
 92 | }
 93 | 
 94 | uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits);
 95 | 
 96 | /* Return bits bit0..bit1 (inclusive) of a 64-bit word, moved down so that bit0 lands in bit 0. */
 97 | static DETEX_INLINE_ONLY uint32_t detexGetBits64(uint64_t data, int bit0, int bit1) {
 98 | 	// Mask that keeps bit1 and everything below it. bit1 == 63 is special-cased
 99 | 	// because shifting a 64-bit value by 64 is undefined.
100 | 	const uint64_t keep = (bit1 == 63) ? UINT64_MAX : (((uint64_t)1 << (bit1 + 1)) - 1);
101 | 	const uint64_t field = (data & keep) >> bit0;
102 | 
103 | 	// The result is truncated to 32 bits.
104 | 	return (uint32_t)field;
105 | }
106 | 
107 | const uint8_t detex_bptc_table_P2[64 * 16] = {	// 64 rows of 16 entries, each 0 or 1. Presumably the subset of each of the 16 pixels for every two-subset partition; its users are outside this view - confirm.
108 | 	0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,
109 | 	0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,
110 | 	0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,
111 | 	0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,
112 | 	0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,
113 | 	0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,
114 | 	0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,
115 | 	0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
116 | 	0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
117 | 	0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,
118 | 	0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,
119 | 	0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,
120 | 	0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,
121 | 	0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,
122 | 	0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,
123 | 	0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
124 | 	0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,
125 | 	0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,
126 | 	0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,
127 | 	0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,
128 | 	0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,
129 | 	0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,
130 | 	0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,
131 | 	0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
132 | 	0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,
133 | 	0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,
134 | 	0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,
135 | 	0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,
136 | 	0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,
137 | 	0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,
138 | 	0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,
139 | 	0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
140 | 	0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
141 | 	0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,
142 | 	0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,
143 | 	0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,
144 | 	0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,
145 | 	0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,
146 | 	0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,
147 | 	0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
148 | 	0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,
149 | 	0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,
150 | 	0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,
151 | 	0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,
152 | 	0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,
153 | 	0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,
154 | 	0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,
155 | 	0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
156 | 	0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,
157 | 	0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,
158 | 	0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,
159 | 	0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,
160 | 	0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,
161 | 	0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,
162 | 	0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,
163 | 	0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
164 | 	0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,
165 | 	0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,
166 | 	0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,
167 | 	0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,
168 | 	0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,
169 | 	0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,
170 | 	0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,
171 | 	0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
172 | };
173 | 
174 | const uint8_t detex_bptc_table_P3[64 * 16] = {	// 64 rows of 16 entries, each 0, 1 or 2. Presumably the subset of each of the 16 pixels for every three-subset partition; its users are outside this view - confirm.
175 | 	0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2,
176 | 	0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1,
177 | 	0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1,
178 | 	0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1,
179 | 	0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2,
180 | 	0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2,
181 | 	0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1,
182 | 	0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1,
183 | 	0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,
184 | 	0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
185 | 	0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2,
186 | 	0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,
187 | 	0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,
188 | 	0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2,
189 | 	0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2,
190 | 	0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0,
191 | 	0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2,
192 | 	0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0,
193 | 	0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2,
194 | 	0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1,
195 | 	0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2,
196 | 	0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1,
197 | 	0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2,
198 | 	0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0,
199 | 	0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0,
200 | 	0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2,
201 | 	0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0,
202 | 	0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1,
203 | 	0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2,
204 | 	0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2,
205 | 	0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1,
206 | 	0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1,
207 | 	0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2,
208 | 	0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1,
209 | 	0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2,
210 | 	0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0,
211 | 	0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0,
212 | 	0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0,
213 | 	0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0,
214 | 	0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1,
215 | 	0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1,
216 | 	0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2,
217 | 	0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1,
218 | 	0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2,
219 | 	0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1,
220 | 	0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1,
221 | 	0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1,
222 | 	0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1,
223 | 	0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2,
224 | 	0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1,
225 | 	0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2,
226 | 	0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2,
227 | 	0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2,
228 | 	0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2,
229 | 	0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2,
230 | 	0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2,
231 | 	0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2,
232 | 	0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2,
233 | 	0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2,
234 | 	0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,
235 | 	0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1,
236 | 	0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2,
237 | 	0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2,
238 | 	0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0,
239 | };
240 | 
241 | const uint8_t detex_bptc_table_anchor_index_second_subset[64] = {	// 64 entries, all below 16. Presumably the anchor pixel of the second subset per two-subset partition; users are outside this view - confirm.
242 | 	15,15,15,15,15,15,15,15,
243 | 	15,15,15,15,15,15,15,15,
244 | 	15, 2, 8, 2, 2, 8, 8,15,
245 | 	2, 8, 2, 2, 8, 8, 2, 2,
246 | 	15,15, 6, 8, 2, 8,15,15,
247 | 	2, 8, 2, 2, 2,15,15, 6,
248 | 	6, 2, 6, 8,15,15, 2, 2,
249 | 	15,15,15,15,15, 2, 2,15
250 | };
251 | 
252 | const uint8_t detex_bptc_table_anchor_index_second_subset_of_three[64] = {	// 64 entries, all below 16. Presumably the second subset's anchor pixel per three-subset partition - confirm.
253 | 	3, 3,15,15, 8, 3,15,15,
254 | 	8, 8, 6, 6, 6, 5, 3, 3,
255 | 	3, 3, 8,15, 3, 3, 6,10,
256 | 	5, 8, 8, 6, 8, 5,15,15,
257 | 	8,15, 3, 5, 6,10, 8,15,
258 | 	15, 3,15, 5,15,15,15,15,
259 | 	3,15, 5, 5, 5, 8, 5,10,
260 | 	5,10, 8,13,15,12, 3, 3
261 | };
262 | 
263 | const uint8_t detex_bptc_table_anchor_index_third_subset[64] = {	// 64 entries, all below 16. Presumably the third subset's anchor pixel per three-subset partition - confirm.
264 | 	15, 8, 8, 3,15,15, 3, 8,
265 | 	15,15,15,15,15,15,15, 8,
266 | 	15, 8,15, 3,15, 8,15, 8,
267 | 	3,15, 6,10,15,15,10, 8,
268 | 	15, 3,15,10,10, 8, 9,10,
269 | 	6,15, 8,15, 3, 6, 6, 8,
270 | 	15, 3,15,15,15,15,15,15,
271 | 	15,15,15,15, 3,15,15, 8
272 | };
273 | 
274 | const uint16_t detex_bptc_table_aWeight2[4] = {	// Weights out of 64 used by Interpolate() when indexprecision == 2.
275 | 	0, 21, 43, 64
276 | };
277 | 
278 | const uint16_t detex_bptc_table_aWeight3[8] = {	// Weights out of 64 used by Interpolate() when indexprecision == 3.
279 | 	0, 9, 18, 27, 37, 46, 55, 64
280 | };
281 | 
282 | const uint16_t detex_bptc_table_aWeight4[16] = {	// Weights out of 64 used by Interpolate() for every other indexprecision (expected: 4).
283 | 	0, 4, 9, 13, 17, 21, 26, 30,
284 | 	34, 38, 43, 47, 51, 55, 60, 64
285 | };
286 | 
287 | 
288 | 
289 | // BPTC mode layout:
290 | //
291 | // Number of subsets = { 3, 2, 3, 2, 1, 1, 1, 2 };
292 | // Partition bits = { 4, 6, 6, 6, 0, 0, 0, 6 };
293 | // Rotation bits = { 0, 0, 0, 0, 2, 2, 0, 0 };
294 | // Mode 4 has one index selection bit.
295 | //
296 | //      #subsets color alpha before color   index after color	 index after	  After	     Index
297 | //                                                               alpha		  pbits	     bits (*)
298 | // Mode 0   3	  4	0    1 + 4 = 5			5 + 6 * 3 * 4 = 77	 77		  + 6 = 83   + 48 - 3 = 128
299 | // Mode 1   2	  6	0    2 + 6 = 8			8 + 4 * 3 * 6 = 80	 80		  + 2 = 82   + 48 - 2 = 128
300 | // Mode 2   3	  5	0    3 + 6 = 9			9 + 6 * 3 * 5 = 99	 99		  99	     + 32 - 3 = 128
301 | // Mode 3   2	  7	0    4 + 6 = 10	   10 + 4 * 3 * 7 = 94	 94		  + 4 = 98   + 32 - 2 = 128
302 | // Mode 4   1	  5	6    5 + 2 + 1 = 8	8 + 2 * 3 * 5 = 38	 38 + 2 * 6 = 50  50	     + 80 - 2 = 128
303 | // Mode 5   1	  7	8    6 + 2 = 8			8 + 2 * 3 * 7 = 50	 50 + 2 * 8 = 66  66	     + 64 - 2 = 128
304 | // Mode 6   1	  7	7    7					7 + 2 * 3 * 7 = 49	 49 + 2 * 7 = 63  + 2 = 65   + 64 - 1 = 128
305 | // Mode 7   2	  5	5    8 + 6 = 14     14 + 4 * 3 * 5 = 74	 74 + 4 * 5 = 94  + 4 = 98   + 32 - 2 = 128
306 | //
307 | // (*) For formats without alpha, the number of index bits is reduced by #subsets anchor bits.
308 | //     For formats with alpha, the number of index bits is reduced by 2 * #subsets by the anchor bits.
309 | 
310 | 
311 | static const uint8_t color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 };	// Stored bits per color endpoint component, indexed by mode (0-7).
312 | 
313 | // Note: precision includes P-bits!
314 | static const uint8_t color_precision_plus_pbit_table[8] = { 5, 7, 5, 8, 5, 7, 8, 6 };	// Same as above plus one bit for the modes flagged in mode_has_p_bits.
315 | 
316 | static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecision(int mode) {	// Table lookup; mode is not range-checked.
317 | 	return color_precision_table[mode];
318 | }
319 | 
320 | static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecisionPlusPbit(int mode) {	// Table lookup; mode is not range-checked.
321 | 	return color_precision_plus_pbit_table[mode];
322 | }
323 | 
324 | static const int8_t alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 };	// Stored bits per alpha endpoint value, indexed by mode; 0 for modes 0-3.
325 | 
326 | // Note: precision includes P-bits!
327 | static const uint8_t alpha_precision_plus_pbit_table[8] = { 0, 0, 0, 0, 6, 8, 8, 6 };	// Same as above plus one bit for modes 6 and 7.
328 | 
329 | static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecision(int mode) {	// Table lookup; the int8_t entry is returned as uint8_t.
330 | 	return alpha_precision_table[mode];
331 | }
332 | 
333 | static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecisionPlusPbit(int mode) {	// Table lookup; mode is not range-checked.
334 | 	return alpha_precision_plus_pbit_table[mode];
335 | }
336 | 
337 | static const int8_t components_in_qword0_table[8] = { 2, -1, 1, 1, 3, 3, 3, 2 };	// Per mode: number of color components read entirely from data0. NOTE(review): with the -1 entry (mode 1) ExtractEndpoints would index endpoint_array[-1]; presumably mode 1 takes a separate path - confirm.
338 | 
339 | /* Extract the raw (not yet expanded) endpoint components of every subset from the block, starting at block->index, into endpoint_array[subset * 8 + endpoint * 4 + component] (component 3 is alpha), and advance block->index past them. */
340 | static void ExtractEndpoints(int mode, int nu_subsets, detexBlock128 * DETEX_RESTRICT block,
341 | 	uint8_t * DETEX_RESTRICT endpoint_array) {
342 | 	// Optimized version avoiding the use of block_extract_bits().
343 | 	int components_in_qword0 = components_in_qword0_table[mode];
344 | 	uint64_t data = block->data0 >> block->index;	// Remaining bits of data0, starting at the cursor.
345 | 	uint8_t precision = GetColorComponentPrecision(mode);
346 | 	uint8_t mask = (1 << precision) - 1;	// Low 'precision' bits set.
347 | 	int total_bits_per_component = nu_subsets * 2 * precision;	// One value per endpoint, two endpoints per subset.
348 | 	for (int i = 0; i < components_in_qword0; i++)	// For each color component.
349 | 		for (int j = 0; j < nu_subsets; j++)	// For each subset.
350 | 			for (int k = 0; k < 2; k++) {	// For each endpoint.
351 | 				endpoint_array[j * 8 + k * 4 + i] = data & mask;
352 | 				data >>= precision;
353 | 			}
354 | 	block->index += components_in_qword0 * total_bits_per_component;	// Skip the components consumed above.
355 | 	if (components_in_qword0 < 3) {
356 | 		// Handle the color component that crosses the boundary between data0 and data1
357 | 		data = block->data0 >> block->index;
358 | 		data |= block->data1 << (64 - block->index);	// Splice the low bits of data1 above what is left of data0.
359 | 		int i = components_in_qword0;
360 | 		for (int j = 0; j < nu_subsets; j++)	// For each subset.
361 | 			for (int k = 0; k < 2; k++) {	// For each endpoint.
362 | 				endpoint_array[j * 8 + k * 4 + i] = data & mask;
363 | 				data >>= precision;
364 | 			}
365 | 		block->index += total_bits_per_component;
366 | 	}
367 | 	if (components_in_qword0 < 2) {
368 | 		// Handle the color component that is wholly in data1.
369 | 		data = block->data1 >> (block->index - 64);
370 | 		int i = 2;
371 | 		for (int j = 0; j < nu_subsets; j++)	// For each subset.
372 | 			for (int k = 0; k < 2; k++) {	// For each endpoint.
373 | 				endpoint_array[j * 8 + k * 4 + i] = data & mask;
374 | 				data >>= precision;
375 | 			}
376 | 		block->index += total_bits_per_component;
377 | 	}
378 | 	// Alpha component.
379 | 	if (GetAlphaComponentPrecision(mode) > 0) {
380 | 		// For mode 7, the alpha data is wholly in data1.
381 | 		// For modes 4 and 6, the alpha data is wholly in data0.
382 | 		// For mode 5, the alpha data is in data0 and data1.
383 | 		if (mode == 7)
384 | 			data = block->data1 >> (block->index - 64);
385 | 		else if (mode == 5)
386 | 			data = (block->data0 >> block->index) | ((block->data1 & 0x3) << 14);	// The low 2 bits of data1 are placed at bits 14-15, above the bits taken from data0.
387 | 		else
388 | 			data = block->data0 >> block->index;
389 | 		uint8_t alpha_precision = GetAlphaComponentPrecision(mode);
390 | 		uint8_t mask = (1 << alpha_precision) - 1;	// Shadows the color mask declared above.
391 | 		for (int j = 0; j < nu_subsets; j++)
392 | 			for (int k = 0; k < 2; k++) {	// For each endpoint.
393 | 				endpoint_array[j * 8 + k * 4 + 3] = data & mask;
394 | 				data >>= alpha_precision;
395 | 			}
396 | 		block->index += nu_subsets * 2 * alpha_precision;
397 | 	}
398 | }
399 | 
400 | // Nonzero for the modes whose endpoints are followed by P-bits.
401 | static const uint8_t mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 };
402 | 
403 | // Expands the raw endpoint components in endpoint_array (4 bytes per endpoint,
404 | // nu_subsets * 2 endpoints) to full 8-bit values:
405 | //   1. for modes flagged in mode_has_p_bits, reads one P-bit per endpoint from the
406 | //      block at block->index, appends it as the new LSB of all four components and
407 | //      advances block->index by nu_subsets * 2;
408 | //   2. shifts every component up so its MSB sits in bit 7 and fills the vacated low
409 | //      bits with a copy of the top bits;
410 | //   3. for modes 0-3 sets the fourth (alpha) component to 0xFF.
411 | // Mode 1 (shared P-bits) handled elsewhere.
412 | static void FullyDecodeEndpoints(uint8_t * DETEX_RESTRICT endpoint_array, int nu_subsets,
413 | 	int mode, detexBlock128 * DETEX_RESTRICT block) {
414 | 	const int endpoint_count = nu_subsets * 2;
415 | 
416 | 	if (mode_has_p_bits[mode]) {
417 | 		// Gather the P-bits; they may straddle the data0/data1 boundary.
418 | 		uint32_t pbits;
419 | 		if (block->index >= 64) {
420 | 			pbits = (uint32_t)(block->data1 >> (block->index - 64));
421 | 		}
422 | 		else {
423 | 			pbits = (uint32_t)(block->data0 >> block->index);
424 | 			if ((block->index + endpoint_count) > 64)
425 | 				pbits |= (block->data1 << (64 - block->index));
426 | 		}
427 | 
428 | 		for (int e = 0; e < endpoint_count; e++, pbits >>= 1) {
429 | 			uint8_t *comp = &endpoint_array[e * 4];
430 | 			const uint8_t pbit = (uint8_t)(pbits & 1);
431 | 			for (int c = 0; c < 4; c++)
432 | 				comp[c] = (uint8_t)((comp[c] << 1) | pbit);
433 | 		}
434 | 		block->index += endpoint_count;
435 | 	}
436 | 
437 | 	// Both precisions already include the P-bit.
438 | 	const int color_prec = GetColorComponentPrecisionPlusPbit(mode);
439 | 	const int alpha_prec = GetAlphaComponentPrecisionPlusPbit(mode);
440 | 	for (int e = 0; e < endpoint_count; e++) {
441 | 		uint8_t *comp = &endpoint_array[e * 4];
442 | 		for (int c = 0; c < 4; c++) {
443 | 			const int prec = (c < 3) ? color_prec : alpha_prec;
444 | 			// Move the MSB to bit 7, then replicate the top bits into the freed LSBs.
445 | 			comp[c] = (uint8_t)(comp[c] << (8 - prec));
446 | 			comp[c] |= (uint8_t)(comp[c] >> prec);
447 | 		}
448 | 		if (mode <= 3)
449 | 			comp[3] = 0xFF;	// These modes have an alpha precision of 0.
450 | 	}
451 | }
452 | 
453 | static uint8_t Interpolate(uint8_t e0, uint8_t e1, uint8_t index, uint8_t indexprecision) {
454 | 	if (indexprecision == 2)
455 | 		return (uint8_t)(((64 - detex_bptc_table_aWeight2[index]) * (uint16_t)e0
456 | 			+ detex_bptc_table_aWeight2[index] * (uint16_t)e1 + 32) >> 6);
457 | 	else
458 | 		if (indexprecision == 3)
459 | 			return (uint8_t)(((64 - detex_bptc_table_aWeight3[index]) * (uint16_t)e0
460 | 				+ detex_bptc_table_aWeight3[index] * (uint16_t)e1 + 32) >> 6);
461 | 		else // indexprecision == 4
462 | 			return (uint8_t)(((64 - detex_bptc_table_aWeight4[index]) * (uint16_t)e0
463 | 				+ detex_bptc_table_aWeight4[index] * (uint16_t)e1 + 32) >> 6);
464 | }
465 | 
466 | static const uint8_t bptc_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };
467 | 
468 | static DETEX_INLINE_ONLY int GetColorIndexBitcount(int mode, int index_selection_bit) {
469 | 	// If the index selection bit is set for mode 4, return 3, otherwise 2.
470 | 	return bptc_color_index_bitcount[mode] + index_selection_bit;
471 | }
472 | 
473 | static uint8_t bptc_alpha_index_bitcount[8] = { 3, 3, 2, 2, 3, 2, 4, 2 };
474 | 
475 | static DETEX_INLINE_ONLY int GetAlphaIndexBitcount(int mode, int index_selection_bit) {
476 | 	// If the index selection bit is set for mode 4, return 2, otherwise 3.
477 | 	return bptc_alpha_index_bitcount[mode] - index_selection_bit;
478 | }
479 | 
480 | static const uint8_t bptc_NS[8] = { 3, 2, 3, 2, 1, 1, 1, 2 };
481 | 
482 | static DETEX_INLINE_ONLY int GetNumberOfSubsets(int mode) {
483 | 	return bptc_NS[mode];
484 | }
485 | 
486 | static const uint8_t PB[8] = { 4, 6, 6, 6, 0, 0, 0, 6 };
487 | 
488 | static DETEX_INLINE_ONLY int GetNumberOfPartitionBits(int mode) {
489 | 	return PB[mode];
490 | }
491 | 
492 | static const uint8_t RB[8] = { 0, 0, 0, 0, 2, 2, 0, 0 };
493 | 
494 | static DETEX_INLINE_ONLY int GetNumberOfRotationBits(int mode) {
495 | 	return RB[mode];
496 | }
497 | 
// Functions to extract parameters.
499 | 
500 | static int ExtractMode(detexBlock128 *block) {
501 | 	for (int i = 0; i < 8; i++)
502 | 		if (block->data0 & ((uint64_t)1 << i)) {
503 | 			block->index = i + 1;
504 | 			return i;
505 | 		}
506 | 	// Illegal.
507 | 	return -1;
508 | }
509 | 
510 | static DETEX_INLINE_ONLY int ExtractPartitionSetID(detexBlock128 *block, int mode) {
511 | 	return detexBlock128ExtractBits(block, GetNumberOfPartitionBits(mode));
512 | }
513 | 
514 | static DETEX_INLINE_ONLY int GetPartitionIndex(int nu_subsets, int partition_set_id, int i) {
515 | 	if (nu_subsets == 1)
516 | 		return 0;
517 | 	if (nu_subsets == 2)
518 | 		return detex_bptc_table_P2[partition_set_id * 16 + i];
519 | 	return detex_bptc_table_P3[partition_set_id * 16 + i];
520 | }
521 | 
522 | static DETEX_INLINE_ONLY int ExtractRotationBits(detexBlock128 *block, int mode) {
523 | 	return detexBlock128ExtractBits(block, GetNumberOfRotationBits(mode));
524 | }
525 | 
526 | static DETEX_INLINE_ONLY int GetAnchorIndex(int partition_set_id, int partition, int nu_subsets) {
527 | 	if (partition == 0)
528 | 		return 0;
529 | 	if (nu_subsets == 2)
530 | 		return detex_bptc_table_anchor_index_second_subset[partition_set_id];
531 | 	if (partition == 1)
532 | 		return detex_bptc_table_anchor_index_second_subset_of_three[partition_set_id];
533 | 	return detex_bptc_table_anchor_index_third_subset[partition_set_id];
534 | }
535 | 
// Per-mode lookup tables used by detexDecompressBlockBPTC (all indexed by mode 0-7).
// IB: bits per primary index; an anchor pixel's index is stored with one bit fewer.
static const uint8_t IB[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };
// IB2: bits per secondary index; 0 means the mode has no secondary index data.
static const uint8_t IB2[8] = { 0, 0, 0, 0, 3, 2, 0, 0 };
// Non-zero when the mode stores a partition set ID (and so may use more than one subset).
static const uint8_t mode_has_partition_bits[8] = { 1, 1, 1, 1, 0, 0, 0, 1 };
539 | 
540 | /* Decompress a 128-bit 4x4 pixel texture block compressed using BPTC mode 1. */
541 | 
542 | static bool DecompressBlockBPTCMode1(detexBlock128 * DETEX_RESTRICT block,
543 | 	uint8_t * DETEX_RESTRICT pixel_buffer) {
544 | 	uint64_t data0 = block->data0;
545 | 	uint64_t data1 = block->data1;
546 | 	int partition_set_id = detexGetBits64(data0, 2, 7);
547 | 	uint8_t endpoint[2 * 2 * 3];	// 2 subsets.
548 | 	endpoint[0] = detexGetBits64(data0, 8, 13);	// red, subset 0, endpoint 0
549 | 	endpoint[3] = detexGetBits64(data0, 14, 19);	// red, subset 0, endpoint 1
550 | 	endpoint[6] = detexGetBits64(data0, 20, 25);	// red, subset 1, endpoint 0
551 | 	endpoint[9] = detexGetBits64(data0, 26, 31);	// red, subset 1, endpoint 1
552 | 	endpoint[1] = detexGetBits64(data0, 32, 37);	// green, subset 0, endpoint 0
553 | 	endpoint[4] = detexGetBits64(data0, 38, 43);	// green, subset 0, endpoint 1
554 | 	endpoint[7] = detexGetBits64(data0, 44, 49);	// green, subset 1, endpoint 0
555 | 	endpoint[10] = detexGetBits64(data0, 50, 55);	// green, subset 1, endpoint 1
556 | 	endpoint[2] = detexGetBits64(data0, 56, 61);	// blue, subset 0, endpoint 0
557 | 	endpoint[5] = detexGetBits64(data0, 62, 63)	// blue, subset 0, endpoint 1
558 | 		| (detexGetBits64(data1, 0, 3) << 2);
559 | 	endpoint[8] = detexGetBits64(data1, 4, 9);	// blue, subset 1, endpoint 0
560 | 	endpoint[11] = detexGetBits64(data1, 10, 15);	// blue, subset 1, endpoint 1
561 | 																	// Decode endpoints.
562 | 	for (int i = 0; i < 2 * 2; i++) {
563 | 		//component-wise left-shift
564 | 		endpoint[i * 3 + 0] <<= 2;
565 | 		endpoint[i * 3 + 1] <<= 2;
566 | 		endpoint[i * 3 + 2] <<= 2;
567 | 	}
568 | 	// P-bit is shared.
569 | 	uint8_t pbit_zero = detexGetBits64(data1, 16, 16) << 1;
570 | 	uint8_t pbit_one = detexGetBits64(data1, 17, 17) << 1;
571 | 	// RGB only pbits for mode 1, one for each subset.
572 | 	for (int j = 0; j < 3; j++) {
573 | 		endpoint[0 * 3 + j] |= pbit_zero;
574 | 		endpoint[1 * 3 + j] |= pbit_zero;
575 | 		endpoint[2 * 3 + j] |= pbit_one;
576 | 		endpoint[3 * 3 + j] |= pbit_one;
577 | 	}
578 | 	for (int i = 0; i < 2 * 2; i++) {
579 | 		// Replicate each component's MSB into the LSB.
580 | 		endpoint[i * 3 + 0] |= endpoint[i * 3 + 0] >> 7;
581 | 		endpoint[i * 3 + 1] |= endpoint[i * 3 + 1] >> 7;
582 | 		endpoint[i * 3 + 2] |= endpoint[i * 3 + 2] >> 7;
583 | 	}
584 | 
585 | 	uint8_t subset_index[16];
586 | 	for (int i = 0; i < 16; i++)
587 | 		// subset_index[i] is a number from 0 to 1.
588 | 		subset_index[i] = detex_bptc_table_P2[partition_set_id * 16 + i];
589 | 	uint8_t anchor_index[2];
590 | 	anchor_index[0] = 0;
591 | 	anchor_index[1] = detex_bptc_table_anchor_index_second_subset[partition_set_id];
592 | 	uint8_t color_index[16];
593 | 	// Extract primary index bits.
594 | 	data1 >>= 18;
595 | 	for (int i = 0; i < 16; i++)
596 | 		if (i == anchor_index[subset_index[i]]) {
597 | 			// Highest bit is zero.
598 | 			color_index[i] = data1 & 3; // Get two bits.
599 | 			data1 >>= 2;
600 | 		}
601 | 		else {
602 | 			color_index[i] = data1 & 7;	// Get three bits.
603 | 			data1 >>= 3;
604 | 		}
605 | 
606 | 	uint32_t *pixel32_buffer = (uint32_t *)pixel_buffer;
607 | 	for (int i = 0; i < 16; i++) {
608 | 		uint8_t endpoint_start[3];
609 | 		uint8_t endpoint_end[3];
610 | 		for (int j = 0; j < 3; j++) {
611 | 			endpoint_start[j] = endpoint[2 * subset_index[i] * 3 + j];
612 | 			endpoint_end[j] = endpoint[(2 * subset_index[i] + 1) * 3 + j];
613 | 		}
614 | 		uint32_t output;
615 | 		output = detexPack32R8(Interpolate(endpoint_start[0], endpoint_end[0], color_index[i], 3));
616 | 		output |= detexPack32G8(Interpolate(endpoint_start[1], endpoint_end[1], color_index[i], 3));
617 | 		output |= detexPack32B8(Interpolate(endpoint_start[2], endpoint_end[2], color_index[i], 3));
618 | 		output |= detexPack32A8(0xFF);
619 | 		pixel32_buffer[i] = output;
620 | 	}
621 | 	return true;
622 | }
623 | 
624 | /* Decompress a 128-bit 4x4 pixel texture block compressed using the BPTC */
625 | /* (BC7) format. */
626 | bool detexDecompressBlockBPTC(const uint8_t * DETEX_RESTRICT bitstring, uint32_t mode_mask,
627 | 	uint32_t flags, uint8_t * DETEX_RESTRICT pixel_buffer) {
628 | 	detexBlock128 block;
629 | 	block.data0 = *(uint64_t *)&bitstring[0];
630 | 	block.data1 = *(uint64_t *)&bitstring[8];
631 | 	block.index = 0;
632 | 	int mode = ExtractMode(&block);
633 | 	if (mode == -1)
634 | 		return 0;
635 | 	// Allow compression tied to specific modes (according to mode_mask).
636 | 	if (!(mode_mask & ((int)1 << mode)))
637 | 		return 0;
638 | 	if (mode >= 4 && (flags & DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY))
639 | 		return 0;
640 | 	if (mode < 4 && (flags & DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY))
641 | 		return 0;
642 | 	if (mode == 1)
643 | 		return DecompressBlockBPTCMode1(&block, pixel_buffer);
644 | 
645 | 	int nu_subsets = 1;
646 | 	int partition_set_id = 0;
647 | 	if (mode_has_partition_bits[mode]) {
648 | 		nu_subsets = GetNumberOfSubsets(mode);
649 | 		partition_set_id = ExtractPartitionSetID(&block, mode);
650 | 	}
651 | 	int rotation = ExtractRotationBits(&block, mode);
652 | 	int index_selection_bit = 0;
653 | 	if (mode == 4)
654 | 		index_selection_bit = detexBlock128ExtractBits(&block, 1);
655 | 
656 | 	int alpha_index_bitcount = GetAlphaIndexBitcount(mode, index_selection_bit);
657 | 	int color_index_bitcount = GetColorIndexBitcount(mode, index_selection_bit);
658 | 
659 | 	uint8_t endpoint_array[3 * 2 * 4];	// Max. 3 subsets.
660 | 	ExtractEndpoints(mode, nu_subsets, &block, endpoint_array);
661 | 	FullyDecodeEndpoints(endpoint_array, nu_subsets, mode, &block);
662 | 
663 | 	uint8_t subset_index[16];
664 | 	for (int i = 0; i < 16; i++)
665 | 		// subset_index[i] is a number from 0 to 2, or 0 to 1, or 0 depending on the number of subsets.
666 | 		subset_index[i] = GetPartitionIndex(nu_subsets, partition_set_id, i);
667 | 	uint8_t anchor_index[4] = { 0, 0, 0, 0 };	// Only need max. 3 elements.
668 | 	for (int i = 0; i < nu_subsets; i++)
669 | 		anchor_index[i] = GetAnchorIndex(partition_set_id, i, nu_subsets);
670 | 	uint8_t color_index[16];
671 | 	uint8_t alpha_index[16];
672 | 	memset(color_index, 0, sizeof(color_index));
673 | 	memset(alpha_index, 0, sizeof(alpha_index));
674 | 	// Extract primary index bits.
675 | 	uint64_t data1;
676 | 	if (block.index >= 64) {
677 | 		// Because the index bits are all in the second 64-bit word, there is no need to use
678 | 		// block_extract_bits().
679 | 		// This implies the mode is not 4.
680 | 		data1 = block.data1 >> (block.index - 64);
681 | 		uint8_t mask1 = (1 << IB[mode]) - 1;
682 | 		uint8_t mask2 = (1 << (IB[mode] - 1)) - 1;
683 | 		for (int i = 0; i < 16; i++)
684 | 			if (i == anchor_index[subset_index[i]]) {
685 | 				// Highest bit is zero.
686 | 				color_index[i] = data1 & mask2;
687 | 				data1 >>= IB[mode] - 1;
688 | 				alpha_index[i] = color_index[i];
689 | 			}
690 | 			else {
691 | 				color_index[i] = data1 & mask1;
692 | 				data1 >>= IB[mode];
693 | 				alpha_index[i] = color_index[i];
694 | 			}
695 | 	}
696 | 	else {	// Implies mode 4.
697 | 				// Because the bits cross the 64-bit word boundary, we have to be careful.
698 | 				// Block index is 50 at this point.
699 | 		uint64_t data = block.data0 >> 50;
700 | 		data |= block.data1 << 14;
701 | 		for (int i = 0; i < 16; i++)
702 | 		if (i == anchor_index[subset_index[i]]) {
703 | 			// Highest bit is zero.
704 | 			if (index_selection_bit) {	// Implies mode == 4.
705 | 				alpha_index[i] = data & 0x1;
706 | 				data >>= 1;
707 | 			}
708 | 			else {
709 | 				color_index[i] = data & 0x1;
710 | 				data >>= 1;
711 | 			}
712 | 		}
713 | 		else {
714 | 			if (index_selection_bit) {	// Implies mode == 4.
715 | 				alpha_index[i] = data & 0x3;
716 | 				data >>= 2;
717 | 			}
718 | 			else {
719 | 				color_index[i] = data & 0x3;
720 | 				data >>= 2;
721 | 			}
722 | 		}
723 | 		// Block index is 81 at this point.
724 | 		data1 = block.data1 >> (81 - 64);
725 | 	}
726 | 	// Extract secondary index bits.
727 | 	if (IB2[mode] > 0) {
728 | 		uint8_t mask1 = (1 << IB2[mode]) - 1;
729 | 		uint8_t mask2 = (1 << (IB2[mode] - 1)) - 1;
730 | 		for (int i = 0; i < 16; i++)
731 | 			if (i == anchor_index[subset_index[i]]) {
732 | 				// Highest bit is zero.
733 | 				if (index_selection_bit) {
734 | 					color_index[i] = data1 & 0x3;
735 | 					data1 >>= 2;
736 | 				}
737 | 				else {
738 | 					//					alpha_index[i] = block_extract_bits(&block, IB2[mode] - 1);
739 | 					alpha_index[i] = data1 & mask2;
740 | 					data1 >>= IB2[mode] - 1;
741 | 				}
742 | 			}
743 | 			else {
744 | 				if (index_selection_bit) {
745 | 					color_index[i] = data1 & 0x7;
746 | 					data1 >>= 3;
747 | 				}
748 | 				else {
749 | 					//					alpha_index[i] = block_extract_bits(&block, IB2[mode]);
750 | 					alpha_index[i] = data1 & mask1;
751 | 					data1 >>= IB2[mode];
752 | 				}
753 | 			}
754 | 	}
755 | 
756 | 	uint32_t *pixel32_buffer = (uint32_t *)pixel_buffer;
757 | 	for (int i = 0; i < 16; i++) {
758 | 		uint8_t endpoint_start[4];
759 | 		uint8_t endpoint_end[4];
760 | 		for (int j = 0; j < 4; j++) {
761 | 			endpoint_start[j] = endpoint_array[2 * subset_index[i] * 4 + j];
762 | 			endpoint_end[j] = endpoint_array[(2 * subset_index[i] + 1) * 4 + j];
763 | 		}
764 | 
765 | 		uint32_t output = 0;
766 | 		output = detexPack32R8(Interpolate(endpoint_start[0], endpoint_end[0], color_index[i], color_index_bitcount));
767 | 		output |= detexPack32G8(Interpolate(endpoint_start[1], endpoint_end[1], color_index[i], color_index_bitcount));
768 | 		output |= detexPack32B8(Interpolate(endpoint_start[2], endpoint_end[2], color_index[i], color_index_bitcount));
769 | 		output |= detexPack32A8(Interpolate(endpoint_start[3], endpoint_end[3], alpha_index[i], alpha_index_bitcount));
770 | 
771 | 		if (rotation > 0) {
772 | 			if (rotation == 1)
773 | 				output = detexPack32RGBA8(detexPixel32GetA8(output), detexPixel32GetG8(output),
774 | 					detexPixel32GetB8(output), detexPixel32GetR8(output));
775 | 			else
776 | 				if (rotation == 2)
777 | 					output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetA8(output),
778 | 						detexPixel32GetB8(output), detexPixel32GetG8(output));
779 | 				else // rotation == 3
780 | 					output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetG8(output),
781 | 						detexPixel32GetA8(output), detexPixel32GetB8(output));
782 | 		}
783 | 		pixel32_buffer[i] = output;
784 | 	}
785 | 	return true;
786 | }
787 | 
788 | /* Return the internal mode of the BPTC block. */
789 | uint32_t detexGetModeBPTC(const uint8_t *bitstring) {
790 | 	detexBlock128 block;
791 | 	block.data0 = *(uint64_t *)&bitstring[0];
792 | 	block.data1 = *(uint64_t *)&bitstring[8];
793 | 	block.index = 0;
794 | 	int mode = ExtractMode(&block);
795 | 	return mode;
796 | }
797 | 
/* Rewrite the mode prefix in the first byte of a BPTC block. */
/* The bit for the requested mode is set and all lower bits are cleared; the */
/* higher bits of the byte are preserved. Modes above 7 do not exist and leave */
/* the block unchanged. flags and colors are not used. */
void detexSetModeBPTC(uint8_t *bitstring, uint32_t mode, uint32_t flags,
	uint32_t *colors) {
	// Accepted for interface compatibility only; the casts silence
	// unused-parameter warnings.
	(void)flags;
	(void)colors;
	// The prefix must fit in the first byte. Without this guard, modes 8-31
	// zeroed the whole byte and modes >= 32 shifted by more than the type width.
	if (mode > 7)
		return;
	// Mode 0 starts with 1
	// Mode 1 starts with 01
	// ...
	// Mode 7 starts with 00000001
	const unsigned int bit = 1u << mode;
	bitstring[0] &= (uint8_t)~(bit - 1u);
	bitstring[0] |= (uint8_t)bit;
}
809 | 
810 | 


--------------------------------------------------------------------------------
/bc7decomp.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifdef _MSC_VER
 4 | #define DETEX_INLINE_ONLY __forceinline
 5 | #define DETEX_RESTRICT __restrict
 6 | #else
 7 | #define DETEX_INLINE_ONLY
 8 | #define DETEX_RESTRICT
 9 | #endif
10 | 
11 | enum {
12 | 	/* Function returns false (invalid block) when the compressed block */
13 | 	/* is in a format not allowed to be generated by an encoder. */
14 | 	DETEX_DECOMPRESS_FLAG_ENCODE = 0x1,
15 | 	/* For compression formats that have opaque and non-opaque modes, */
16 | 	/* return false (invalid block) when the compressed block is encoded */
17 | 	/* using a non-opaque mode. */
18 | 	DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY = 0x2,
19 | 	/* For compression formats that have opaque and non-opaque modes, */
20 | 	/* return false (invalid block) when the compressed block is encoded */
21 | 	/* using an opaque mode. */
22 | 	DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY = 0x4,
23 | }; 
24 | 
25 | #ifdef __cplusplus
26 | extern "C" {
27 | #endif
28 | 
29 | bool detexDecompressBlockBPTC(const uint8_t * DETEX_RESTRICT bitstring, uint32_t mode_mask,
30 | 	uint32_t flags, uint8_t * DETEX_RESTRICT pixel_buffer);
31 | 
32 | #ifdef __cplusplus
33 | }
34 | #endif
35 | 


--------------------------------------------------------------------------------
/bc7enc.cpp:
--------------------------------------------------------------------------------
// bc7enc.cpp - bc7enc16.c command line example/test app
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <math.h>
  6 | #include <algorithm>
  7 | #include <assert.h>
  8 | #include <time.h>
  9 | 
 10 | #include "bc7enc16.h"
 11 | #include "lodepng.h"
 12 | #include "dds_defs.h"
 13 | #include "ktx_defs.h"
 14 | #include "bc7decomp.h"
 15 | 
 16 | template <typename T> inline T clamp(T v, T l, T h) { if (v < l) v = l; else if (v > h) v = h; return v; }
 17 | inline int iabs(int i) { if (i < 0) i = -i; return i; }
 18 | 
 19 | static int print_usage()
 20 | {
 21 | 	fprintf(stderr, "bc7enc\n");
 22 | 	fprintf(stderr, "Reads PNG files (with or without alpha channels) and packs them to BC7/BPTC using modes 1 and 6.\n");
 23 | 	fprintf(stderr, "This tool works best with opaque images, or on images with relatively simple alpha channels.\n");
 24 | 	fprintf(stderr, "By default, a DX10 DDS file and a unpacked PNG file will be written to the source file's directory with the .dds/_unpacked.png/_unpacked_alpha.png suffixes.\n\n");
 25 | 	fprintf(stderr, "Usage: bc7enc [-apng_filename] [-l] [-uX] [-aX] [-g] [-y] input_filename.png [compressed_output.dds] [unpacked_output.png]\n");
 26 | 	fprintf(stderr, "-apng_filename Load G channel of PNG file into alpha channel of source image\n");
 27 | 	fprintf(stderr, "-l Use linear colorspace metrics instead of perceptual\n");
 28 | 	fprintf(stderr, "-uX Higher quality levels, X ranges from [0,4], higher=slower\n");
 29 | 	fprintf(stderr, "-pX Scan X partitions in mode 1, X ranges from [0,64], use 0 to disable mode 1 entirely (faster)\n");
 30 | 	fprintf(stderr, "-g Don't write an unpacked output PNG file\n");
 31 | 	fprintf(stderr, "-y Flip source image along Y axis before packing\n");
 32 | 	fprintf(stderr, "-k Generate .ktx file instead of .dds file\n");
 33 | 
 34 | 	return EXIT_FAILURE;
 35 | }
 36 | 
 37 | struct color_quad_u8
 38 | {
 39 | 	uint8_t m_c[4];
 40 | 
 41 | 	inline color_quad_u8(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
 42 | 	{
 43 | 		set(r, g, b, a);
 44 | 	}
 45 | 
 46 | 	inline color_quad_u8(uint8_t y = 0, uint8_t a = 255)
 47 | 	{
 48 | 		set(y, a);
 49 | 	}
 50 | 
 51 | 	inline color_quad_u8 &set(uint8_t y, uint8_t a = 255)
 52 | 	{
 53 | 		m_c[0] = y;
 54 | 		m_c[1] = y;
 55 | 		m_c[2] = y;
 56 | 		m_c[3] = a;
 57 | 		return *this;
 58 | 	}
 59 | 
 60 | 	inline color_quad_u8 &set(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
 61 | 	{
 62 | 		m_c[0] = r;
 63 | 		m_c[1] = g;
 64 | 		m_c[2] = b;
 65 | 		m_c[3] = a;
 66 | 		return *this;
 67 | 	}
 68 | 
 69 | 	inline uint8_t &operator[] (uint32_t i) { assert(i < 4);  return m_c[i]; }
 70 | 	inline uint8_t operator[] (uint32_t i) const { assert(i < 4); return m_c[i]; }
 71 | 
 72 | 	inline int get_luma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; } // REC709 weightings
 73 | };
 74 | typedef std::vector<color_quad_u8> color_quad_u8_vec;
 75 | 
 76 | class image_u8
 77 | {
 78 | public:
 79 | 	image_u8() :
 80 | 		m_width(0), m_height(0)
 81 | 	{
 82 | 	}
 83 | 
 84 | 	image_u8(uint32_t width, uint32_t height) :
 85 | 		m_width(width), m_height(height)
 86 | 	{
 87 | 		m_pixels.resize(width * height);
 88 | 	}
 89 | 
 90 | 	inline const color_quad_u8_vec &get_pixels() const { return m_pixels; }
 91 | 	inline color_quad_u8_vec &get_pixels() { return m_pixels; }
 92 | 
 93 | 	inline uint32_t width() const { return m_width; }
 94 | 	inline uint32_t height() const { return m_height; }
 95 | 	inline uint32_t total_pixels() const { return m_width * m_height; }
 96 | 
 97 | 	inline color_quad_u8 &operator()(uint32_t x, uint32_t y) { assert(x < m_width && y < m_height);  return m_pixels[x + m_width * y]; }
 98 | 	inline const color_quad_u8 &operator()(uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height);  return m_pixels[x + m_width * y]; }
 99 | 
100 | 	image_u8& clear()
101 | 	{
102 | 		m_width = m_height = 0;
103 | 		m_pixels.clear();
104 | 		return *this;
105 | 	}
106 | 
107 | 	image_u8& init(uint32_t width, uint32_t height)
108 | 	{
109 | 		clear();
110 | 
111 | 		m_width = width;
112 | 		m_height = height;
113 | 		m_pixels.resize(width * height);
114 | 		return *this;
115 | 	}
116 | 
117 | 	image_u8& set_all(const color_quad_u8 &p)
118 | 	{
119 | 		for (uint32_t i = 0; i < m_pixels.size(); i++)
120 | 			m_pixels[i] = p;
121 | 		return *this;
122 | 	}
123 | 
124 | 	image_u8& crop(uint32_t new_width, uint32_t new_height)
125 | 	{
126 | 		if ((m_width == new_width) && (m_height == new_height))
127 | 			return *this;
128 | 
129 | 		image_u8 new_image(new_width, new_height);
130 | 
131 | 		const uint32_t w = std::min(m_width, new_width);
132 | 		const uint32_t h = std::min(m_height, new_height);
133 | 
134 | 		for (uint32_t y = 0; y < h; y++)
135 | 			for (uint32_t x = 0; x < w; x++)
136 | 				new_image(x, y) = (*this)(x, y);
137 | 
138 | 		return swap(new_image);
139 | 	}
140 | 
141 | 	image_u8 &swap(image_u8 &other)
142 | 	{
143 | 		std::swap(m_width, other.m_width);
144 | 		std::swap(m_height, other.m_height);
145 | 		std::swap(m_pixels, other.m_pixels);
146 | 		return *this;
147 | 	}
148 | 
149 | 	inline void get_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8 *pPixels)
150 | 	{
151 | 		assert((bx * width + width) <= m_width);
152 | 		assert((by * height + height) <= m_height);
153 | 
154 | 		for (uint32_t y = 0; y < height; y++)
155 | 			memcpy(pPixels + y * width, &(*this)(bx * width, by * height + y), width * sizeof(color_quad_u8));
156 | 	}
157 | 
158 | 	inline void set_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const color_quad_u8 *pPixels)
159 | 	{
160 | 		assert((bx * width + width) <= m_width);
161 | 		assert((by * height + height) <= m_height);
162 | 
163 | 		for (uint32_t y = 0; y < height; y++)
164 | 			memcpy(&(*this)(bx * width, by * height + y), pPixels + y * width, width * sizeof(color_quad_u8));
165 | 	}
166 | 
167 | 	image_u8 &swizzle(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
168 | 	{
169 | 		assert((r | g | b | a) <= 3);
170 | 		for (uint32_t y = 0; y < m_height; y++)
171 | 		{
172 | 			for (uint32_t x = 0; x < m_width; x++)
173 | 			{
174 | 				color_quad_u8 tmp((*this)(x, y));
175 | 				(*this)(x, y).set(tmp[r], tmp[g], tmp[b], tmp[a]);
176 | 			}
177 | 		}
178 | 
179 | 		return *this;
180 | 	}
181 | 
182 | private:
183 | 	color_quad_u8_vec m_pixels;
184 | 	uint32_t m_width, m_height;
185 | };
186 | 
187 | static bool load_png(const char *pFilename, image_u8 &img)
188 | {
189 | 	img.clear();
190 | 
191 | 	std::vector<unsigned char> pixels;
192 | 	unsigned int w = 0, h = 0;
193 | 	unsigned int e = lodepng::decode(pixels, w, h, pFilename);
194 | 	if (e != 0)
195 | 	{
196 | 		fprintf(stderr, "Failed loading PNG file %s\n", pFilename);
197 | 		return false;
198 | 	}
199 | 
200 | 	img.init(w, h);
201 | 	memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t));
202 | 
203 | 	return true;
204 | }
205 | 
206 | static bool save_png(const char *pFilename, const image_u8 &img, bool save_alpha)
207 | {
208 | 	const uint32_t w = img.width();
209 | 	const uint32_t h = img.height();
210 | 
211 | 	std::vector<unsigned char> pixels;
212 | 	if (save_alpha)
213 | 	{
214 | 		pixels.resize(w * h * sizeof(color_quad_u8));
215 | 		memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8));
216 | 	}
217 | 	else
218 | 	{
219 | 		pixels.resize(w * h * 3);
220 | 		unsigned char *pDst = &pixels[0];
221 | 		for (uint32_t y = 0; y < h; y++)
222 | 			for (uint32_t x = 0; x < w; x++, pDst += 3)
223 | 				pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2];
224 | 	}
225 | 
226 | 	return lodepng::encode(pFilename, pixels, w, h, save_alpha ? LCT_RGBA : LCT_RGB) == 0;
227 | }
228 | 
229 | class image_metrics
230 | {
231 | public:
232 | 	double m_max, m_mean, m_mean_squared, m_root_mean_squared, m_peak_snr;
233 | 
234 | 	image_metrics()
235 | 	{
236 | 		clear();
237 | 	}
238 | 
239 | 	void clear()
240 | 	{
241 | 		memset(this, 0, sizeof(*this));
242 | 	}
243 | 
244 | 	void compute(const image_u8 &a, const image_u8 &b, uint32_t first_channel, uint32_t num_channels)
245 | 	{
246 | 		const bool average_component_error = true;
247 | 
248 | 		const uint32_t width = std::min(a.width(), b.width());
249 | 		const uint32_t height = std::min(a.height(), b.height());
250 | 
251 | 		assert((first_channel < 4U) && (first_channel + num_channels <= 4U));
252 | 
253 | 		// Histogram approach originally due to Charles Bloom.
254 | 		double hist[256];
255 | 		memset(hist, 0, sizeof(hist));
256 | 
257 | 		for (uint32_t y = 0; y < height; y++)
258 | 		{
259 | 			for (uint32_t x = 0; x < width; x++)
260 | 			{
261 | 				const color_quad_u8 &ca = a(x, y);
262 | 				const color_quad_u8 &cb = b(x, y);
263 | 
264 | 				if (!num_channels)
265 | 					hist[iabs(ca.get_luma() - cb.get_luma())]++;
266 | 				else
267 | 				{
268 | 					for (uint32_t c = 0; c < num_channels; c++)
269 | 						hist[iabs(ca[first_channel + c] - cb[first_channel + c])]++;
270 | 				}
271 | 			}
272 | 		}
273 | 
274 | 		m_max = 0;
275 | 		double sum = 0.0f, sum2 = 0.0f;
276 | 		for (uint32_t i = 0; i < 256; i++)
277 | 		{
278 | 			if (!hist[i])
279 | 				continue;
280 | 
281 | 			m_max = std::max<double>(m_max, i);
282 | 
283 | 			double x = i * hist[i];
284 | 
285 | 			sum += x;
286 | 			sum2 += i * x;
287 | 		}
288 | 
289 | 		// See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html
290 | 		double total_values = width * height;
291 | 
292 | 		if (average_component_error)
293 | 			total_values *= clamp<uint32_t>(num_channels, 1, 4);
294 | 
295 | 		m_mean = clamp<double>(sum / total_values, 0.0f, 255.0f);
296 | 		m_mean_squared = clamp<double>(sum2 / total_values, 0.0f, 255.0f * 255.0f);
297 | 
298 | 		m_root_mean_squared = sqrt(m_mean_squared);
299 | 
300 | 		if (!m_root_mean_squared)
301 | 			m_peak_snr = 1e+10f;
302 | 		else
303 | 			m_peak_snr = clamp<double>(log10(255.0f / m_root_mean_squared) * 20.0f, 0.0f, 500.0f);
304 | 	}
305 | };
306 | 
// One compressed block held as two 64-bit words (16 bytes). Arrays of these are
// written to disk unmodified by save_bc7_dds and save_bc7_ktx.
struct bc7_block
{
	uint64_t m_vals[2];
};

// Growable array of compressed blocks.
typedef std::vector<bc7_block> bc7_block_vec;
313 | 
314 | static bool save_bc7_dds(const char *pFilename, uint32_t width, uint32_t height, const bc7_block *pBlocks, bool srgb)
315 | {
316 | 	(void)srgb;
317 | 
318 | 	FILE *pFile = NULL;
319 | 	pFile = fopen(pFilename, "wb");
320 | 	if (!pFile)
321 | 	{
322 | 		fprintf(stderr, "Failed creating file %s!\n", pFilename);
323 | 		return false;
324 | 	}
325 | 
326 | 	fwrite("DDS ", 4, 1, pFile);
327 | 
328 | 	DDSURFACEDESC2 desc;
329 | 	memset(&desc, 0, sizeof(desc));
330 | 
331 | 	desc.dwSize = sizeof(desc);
332 | 	desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CAPS;
333 | 
334 | 	desc.dwWidth = width;
335 | 	desc.dwHeight = height;
336 | 
337 | 	desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE;
338 | 	desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat);
339 | 
340 | 	desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC;
341 | 
342 | 	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0');
343 | 	desc.ddpfPixelFormat.dwRGBBitCount = 0;
344 | 
345 | 	const uint32_t pixel_format_bpp = 8;
346 | 	desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3;
347 | 	desc.dwFlags |= DDSD_LINEARSIZE;
348 | 
349 | 	fwrite(&desc, sizeof(desc), 1, pFile);
350 | 
351 | 	DDS_HEADER_DXT10 hdr10;
352 | 	memset(&hdr10, 0, sizeof(hdr10));
353 | 
354 | 	// Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here.
355 | 	// For best compatibility just write DXGI_FORMAT_BC7_UNORM.
356 | 	//hdr10.dxgiFormat = srgb ? DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM;
357 | 	hdr10.dxgiFormat = DXGI_FORMAT_BC7_UNORM;
358 | 	hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
359 | 	hdr10.arraySize = 1;
360 | 
361 | 	fwrite(&hdr10, sizeof(hdr10), 1, pFile);
362 | 
363 | 	fwrite(pBlocks, desc.lPitch, 1, pFile);
364 | 
365 | 	if (fclose(pFile) == EOF)
366 | 	{
367 | 		fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename);
368 | 		return false;
369 | 	}
370 | 
371 | 	return true;
372 | }
373 | 
374 | static bool save_bc7_ktx(const char *pFilename,
375 | 			 uint32_t width, uint32_t height,
376 | 			 const bc7_block *pBlocks, bool srgb, bool has_alpha)
377 | {
378 | 	(void) has_alpha;      // RGB without A is currently unsupported
379 | 	FILE *pFile = NULL;
380 | 	pFile = fopen(pFilename, "wb");
381 | 	if (!pFile)
382 | 	{
383 | 		fprintf(stderr, "Failed creating file %s!\n", pFilename);
384 | 		return false;
385 | 	}
386 | 
387 | 	uint32_t keyValueSizeBrutto = 0;
388 | 
389 |         //
390 | 	// key/value pair length computation
391 |         //
392 | 
393 | 	// first key/value pair
394 | 	uint32_t keyValueKtxOrientationSizeNetto = sizeof(ktxOrientation);
395 | 	uint32_t keyValueKtxOrientationSizeBrutto = (keyValueKtxOrientationSizeNetto + 3) & ~3;
396 | 	keyValueSizeBrutto += keyValueKtxOrientationSizeBrutto + 4 /* 4 is the size of the size field */;
397 | 	// additional pairs
398 |         // ...
399 | 
400 | 	//
401 | 	// header
402 | 	//
403 | 
404 | 	struct KTX_HEADER header;
405 | 	memcpy(header.identifier, ktxFileIdentifier, sizeof(header.identifier));
406 | 	header.endianness = ktxEndianess;
407 | 	header.glType = 0;             // 0: compressed texture
408 | 	header.glTypeSize = 1;         // 1: endianess independent, especially for compressed textures
409 | 	header.glFormat = 0;           // 0: compressed texture
410 | 	header.glInternalFormat = (srgb) ? GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM : GL_COMPRESSED_RGBA_BPTC_UNORM;  // see table 8.14 and chapter C.2 of OpenGL 4.4. specification
411 | //	header.glBaseInternalFormat = (has_alpha) ? GL_RGBA : GL_RGB;   // see table 8.11
412 | 	header.glBaseInternalFormat = GL_RGBA;   // see table 8.11
413 | 	header.pixelWidth = width;
414 | 	header.pixelHeight = height;
415 | 	header.pixelDepth = 0;
416 | 	header.numberOfArrayElements = 0;   // numberOfArrayElements denotes the number of array elements which is the number of elements in the array or the size of the array measured in elements.
417 | 	header.numberOfFaces = 1;           // cubemap/s/arrays: 6, otherwise: 1
418 | 	header.numberOfMipmapLevels = 1;    // non-mipmapped: 1
419 | 	header.bytesOfKeyValueData = keyValueSizeBrutto;
420 | 	fwrite(&header, sizeof(header), 1, pFile);
421 | 
422 | 	//
423 | 	// key/value pairs
424 | 	//
425 | 
426 | 	const uint8_t pad[3] = {0 , 0 , 0};
427 | 
428 | 	// first key/value pair
429 | 	fwrite(&keyValueKtxOrientationSizeNetto, sizeof(keyValueKtxOrientationSizeNetto), 1, pFile);
430 | 	fwrite(ktxOrientation, sizeof(ktxOrientation), 1, pFile);
431 | 	uint32_t numOfPad = keyValueKtxOrientationSizeBrutto - keyValueKtxOrientationSizeNetto;
432 | 	fwrite(pad, 1, numOfPad, pFile);
433 | 	// additional pairs
434 |         // ...
435 | 
436 | 	//
437 | 	// image size
438 | 	//
439 | 
440 | 	const uint32_t pixel_format_bpp = 8;   // 8 bits per pixel (compressed to a quarter of that needed by RGBA)
441 | 	const uint32_t imageSize = (((width + 3) & ~3) * ((height + 3) & ~3) * pixel_format_bpp) >> 3;
442 | 	fwrite(&imageSize, sizeof(imageSize), 1, pFile);
443 | 
444 | 	//
445 | 	// image data
446 | 	//
447 | 
448 | 	fwrite(pBlocks, imageSize, 1, pFile);
449 | 
450 | 	if (fclose(pFile) == EOF)
451 | 	{
452 | 		fprintf(stderr, "Failed writing to KTX file %s!\n", pFilename);
453 | 		return false;
454 | 	}
455 | 
456 | 	return true;
457 | }
458 | 
// Removes the file extension from s in place: the last '.' and everything after it
// are dropped. Only a '.' that comes after the last path separator ('/' or '\\') is
// treated as the start of an extension, so dots inside directory names (for example
// "../image" or "dir.v2/tex") no longer truncate the path itself. A string without
// such a '.' is left unchanged.
static void strip_extension(std::string &s)
{
	// Whichever of '.', '/' and '\\' occurs last decides: a separator means the final
	// path component has no extension at all.
	const std::string::size_type pos = s.find_last_of("./\\");
	if ((pos != std::string::npos) && (s[pos] == '.'))
		s.resize(pos);
}
470 | 
// Command-line entry point. Parses the arguments, loads the source PNG (optionally taking
// its alpha channel from a second PNG), compresses the image in 4x4 blocks with
// bc7enc16_compress_block(), writes the blocks to a DDS or KTX file and, unless -g was
// given, decompresses the blocks again to print error metrics and save the unpacked PNGs.
// Returns EXIT_FAILURE on an invalid argument, a failed image load or a failed save, and
// EXIT_SUCCESS otherwise. With fewer than two arguments it returns the result of print_usage().
int main(int argc, char *argv[])
{
	if (argc < 2)
		return print_usage();

	std::string src_filename;
	std::string src_alpha_filename;
	std::string dds_output_filename;
	std::string png_output_filename;
	std::string png_alpha_output_filename;
	int uber_level = 0;
	int max_partitions_to_scan = BC7ENC16_MAX_PARTITIONS1;
	bool perceptual = true;
	bool no_output_png = false;
	bool y_flip = false;
	bool ktx = false;

	// Arguments starting with '-' are options; everything else is a positional filename.
	for (int i = 1; i < argc; i++)
	{
		const char *pArg = argv[i];
		if (pArg[0] == '-')
		{
			// Only the character right after '-' selects the option. -a, -u and -p take
			// their value from the remainder of the same argument (pArg + 2).
			switch (pArg[1])
			{
				case 'y':
				{
					y_flip = true;
					break;
				}
				case 'a':
				{
					// An empty remainder leaves src_alpha_filename empty, which disables the alpha merge below.
					src_alpha_filename = pArg + 2;
					break;
				}
				case 'u':
				{
					uber_level = atoi(pArg + 2);
					if ((uber_level < 0) || (uber_level > BC7ENC16_MAX_UBER_LEVEL))
					{
						fprintf(stderr, "Invalid argument: %s\n", pArg);
						return EXIT_FAILURE;
					}
					break;

				}
				case 'g':
				{
					no_output_png = true;
					break;
				}
				case 'l':
				{
					perceptual = false;
					break;
				}
				case 'p':
				{
					max_partitions_to_scan = atoi(pArg + 2);
					if ((max_partitions_to_scan < 0) || (max_partitions_to_scan > BC7ENC16_MAX_PARTITIONS1))
					{
						fprintf(stderr, "Invalid argument: %s\n", pArg);
						return EXIT_FAILURE;
					}
					break;
				}
				case 'k':
				{
					ktx = true;
					break;
				}
				default:
				{
					fprintf(stderr, "Invalid argument: %s\n", pArg);
					return EXIT_FAILURE;
				}
			}
		}
		else
		{
			// Positional arguments are consumed in order: source image, compressed
			// output file, unpacked PNG. A fourth one is an error.
			if (!src_filename.size())
				src_filename = pArg;
			else if (!dds_output_filename.size())
				dds_output_filename = pArg;
			else if (!png_output_filename.size())
				png_output_filename = pArg;
			else
			{
				fprintf(stderr, "Invalid argument: %s\n", pArg);
				return EXIT_FAILURE;
			}
		}
	}

	if (!src_filename.size())
	{
		fprintf(stderr, "No source filename specified!\n");
		return EXIT_FAILURE;
	}

	// Output names that were not given are derived from the source filename.
	if (!dds_output_filename.size())
	{
		dds_output_filename = src_filename;
		strip_extension(dds_output_filename);
		dds_output_filename += (ktx) ? ktxFileNameExt : ".dds";
	}

	if (!png_output_filename.size())
	{
		png_output_filename = src_filename;
		strip_extension(png_output_filename);
		png_output_filename += "_unpacked.png";
	}

	// The alpha PNG name is always derived from the (possibly user-supplied) PNG output name.
	png_alpha_output_filename = png_output_filename;
	strip_extension(png_alpha_output_filename);
	png_alpha_output_filename += "_unpacked_alpha.png";

	image_u8 source_image;
	if (!load_png(src_filename.c_str(), source_image))
		return EXIT_FAILURE;

	printf("Source image: %s %ux%u\n", src_filename.c_str(), source_image.width(), source_image.height());

	// Optional separate alpha image: over the area common to both images, component 1 of
	// the alpha image (presumably green - confirm against color_quad_u8's layout) is copied
	// into component 3 of the source image.
	if (src_alpha_filename.size())
	{
		image_u8 source_alpha_image;
		if (!load_png(src_alpha_filename.c_str(), source_alpha_image))
			return EXIT_FAILURE;

		printf("Source alpha image: %s %ux%u\n", src_alpha_filename.c_str(), source_alpha_image.width(), source_alpha_image.height());

		const uint32_t w = std::min(source_alpha_image.width(), source_image.width());
		const uint32_t h = std::min(source_alpha_image.height(), source_image.height());

		for (uint32_t y = 0; y < h; y++)
			for (uint32_t x = 0; x < w; x++)
				source_image(x, y)[3] = source_alpha_image(x, y)[1];
	}

	// Remember the dimensions before the image is resized to a multiple of 4 below;
	// these are the values passed to the DDS/KTX writers.
	const uint32_t orig_width = source_image.width();
	const uint32_t orig_height = source_image.height();

	if (y_flip)
	{
		// Build a vertically mirrored copy and swap it into source_image.
		image_u8 temp;
		temp.init(orig_width, orig_height);

		for (uint32_t y = 0; y < orig_height; y++)
			for (uint32_t x = 0; x < orig_width; x++)
				temp(x, (orig_height - 1) - y) = source_image(x, y);

		temp.swap(source_image);
	}

	// Round both dimensions up to the next multiple of 4 so the image divides into whole
	// 4x4 blocks. NOTE(review): how crop() fills the added pixels is not visible here.
	source_image.crop((source_image.width() + 3) & ~3, (source_image.height() + 3) & ~3);

	const uint32_t blocks_x = source_image.width() / 4;
	const uint32_t blocks_y = source_image.height() / 4;

	// One bc7_block per 4x4 pixel block, stored row by row.
	bc7_block_vec packed_image(blocks_x * blocks_y);

	bc7enc16_compress_block_params pack_params;
	bc7enc16_compress_block_params_init(&pack_params);
	if (!perceptual)
		bc7enc16_compress_block_params_init_linear_weights(&pack_params);
	pack_params.m_max_partitions_mode1 = max_partitions_to_scan;
	pack_params.m_uber_level = uber_level;

	printf("Max mode 1 partitions: %u, uber level: %u, perceptual: %u\n", pack_params.m_max_partitions_mode1, pack_params.m_uber_level, perceptual);

	bc7enc16_compress_block_init();

	bool has_alpha = false;

	// Compress every block; only this loop is timed.
	clock_t start_t = clock();
	for (uint32_t by = 0; by < blocks_y; by++)
	{
		for (uint32_t bx = 0; bx < blocks_x; bx++)
		{
			color_quad_u8 pixels[16];

			source_image.get_block(bx, by, 4, 4, pixels);

			bc7_block *pBlock = &packed_image[bx + by * blocks_x];

			// A non-zero return from any block marks the image as having alpha.
			if (bc7enc16_compress_block(pBlock, pixels, &pack_params))
				has_alpha = true;
		}

		// Progress indicator: one dot per 64 block rows.
		if ((by & 63) == 0)
		{
			printf(".");
			fflush(stdout);
		}
	}

	clock_t end_t = clock();

	printf("\nTotal time: %f secs\n", (double)(end_t - start_t) / CLOCKS_PER_SEC);

	if (has_alpha)
		printf("Source image had an alpha channel.\n");

	// A failed save does not stop the run; it is remembered and turns the exit code
	// into EXIT_FAILURE at the end.
	bool failed = false;
	if (ktx)
	{
		if (!save_bc7_ktx(dds_output_filename.c_str(), orig_width, orig_height, &packed_image[0], perceptual, has_alpha))
			failed = true;
		else
			printf("Wrote KTX file %s\n", dds_output_filename.c_str());
	}
	else
	{
		if (!save_bc7_dds(dds_output_filename.c_str(), orig_width, orig_height, &packed_image[0], perceptual))
			failed = true;
		else
			printf("Wrote DDS file %s\n", dds_output_filename.c_str());
	}

	if ((!no_output_png) && (png_output_filename.size()))
	{
		// Decompress every block again into an image of the same (padded) size.
		image_u8 unpacked_image(source_image.width(), source_image.height());

		for (uint32_t by = 0; by < blocks_y; by++)
		{
			for (uint32_t bx = 0; bx < blocks_x; bx++)
			{
				bc7_block *pBlock = &packed_image[bx + by * blocks_x];

				color_quad_u8 unpacked_pixels[16];
				detexDecompressBlockBPTC((const uint8_t *)pBlock, UINT32_MAX, 0, (uint8_t *)unpacked_pixels);

				unpacked_image.set_block(bx, by, 4, 4, unpacked_pixels);
			}
		}

		// Error metrics between the padded source and the decompressed image. The last two
		// compute() arguments presumably select the first channel and the channel count
		// (0, 0 being reported as luma) - confirm against image_metrics::compute.
		image_metrics y_metrics;
		y_metrics.compute(source_image, unpacked_image, 0, 0);
		printf("Luma  Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", y_metrics.m_max, y_metrics.m_root_mean_squared, y_metrics.m_peak_snr);

		image_metrics rgb_metrics;
		rgb_metrics.compute(source_image, unpacked_image, 0, 3);
		printf("RGB   Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgb_metrics.m_max, rgb_metrics.m_root_mean_squared, rgb_metrics.m_peak_snr);

		image_metrics rgba_metrics;
		rgba_metrics.compute(source_image, unpacked_image, 0, 4);
		printf("RGBA  Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgba_metrics.m_max, rgba_metrics.m_root_mean_squared, rgba_metrics.m_peak_snr);

		image_metrics a_metrics;
		a_metrics.compute(source_image, unpacked_image, 3, 1);
		printf("Alpha Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", a_metrics.m_max, a_metrics.m_root_mean_squared, a_metrics.m_peak_snr);

		if (!save_png(png_output_filename.c_str(), unpacked_image, false))
			failed = true;
		else
			printf("Wrote PNG file %s\n", png_output_filename.c_str());

		// The alpha PNG is written regardless of has_alpha (see the disabled condition below).
		//if ((png_alpha_output_filename.size()) && (has_alpha))
		if (png_alpha_output_filename.size())
		{
			// Copy of the unpacked image in which each pixel is rebuilt from its own alpha
			// value via set(alpha, 255) - presumably a grey level with opaque alpha; confirm
			// against color_quad_u8::set.
			image_u8 unpacked_image_alpha(unpacked_image);
			for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++)
				for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++)
					unpacked_image_alpha(x, y).set(unpacked_image_alpha(x, y)[3], 255);

			if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false))
				failed = true;
			else
				printf("Wrote PNG file %s\n", png_alpha_output_filename.c_str());
		}
	}

	return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}
745 | 


--------------------------------------------------------------------------------
/bc7enc16.c:
--------------------------------------------------------------------------------
   1 | // File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file)
   2 | #include "bc7enc16.h"
   3 | #include <math.h>
   4 | #include <memory.h>
   5 | #include <assert.h>
   6 | #include <limits.h>
   7 | 
   8 | // Helpers
   9 | static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high;	return value; }
  10 | static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high;	return value; }
  11 | static inline float saturate(float value) { return clampf(value, 0, 1.0f); }
  12 | static inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; }
  13 | static inline uint32_t minimumu(uint32_t a, uint32_t b) { return (a < b) ? a : b; }
  14 | static inline float minimumf(float a, float b) { return (a < b) ? a : b; }
  15 | static inline uint8_t maximumub(uint8_t a, uint8_t b) { return (a > b) ? a : b; }
  16 | static inline uint32_t maximumu(uint32_t a, uint32_t b) { return (a > b) ? a : b; }
  17 | static inline float maximumf(float a, float b) { return (a > b) ? a : b; }
  18 | static inline int squarei(int i) { return i * i; }
  19 | static inline float squaref(float i) { return i * i; }
  20 | 
  21 | typedef struct { uint8_t m_c[4]; } color_quad_u8;
  22 | typedef struct { float m_c[4]; } vec4F;
  23 | 
  24 | static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { pRes->m_c[0] = (uint8_t)clampi(r, 0, 255); pRes->m_c[1] = (uint8_t)clampi(g, 0, 255); pRes->m_c[2] = (uint8_t)clampi(b, 0, 255); pRes->m_c[3] = (uint8_t)clampi(a, 0, 255); return pRes; }
  25 | static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { assert((uint32_t)(r | g | b | a) <= 255); pRes->m_c[0] = (uint8_t)r; pRes->m_c[1] = (uint8_t)g; pRes->m_c[2] = (uint8_t)b; pRes->m_c[3] = (uint8_t)a; return pRes; }
  26 | static inline bc7enc16_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS) { return (pLHS->m_c[0] != pRHS->m_c[0]) || (pLHS->m_c[1] != pRHS->m_c[1]) || (pLHS->m_c[2] != pRHS->m_c[2]) || (pLHS->m_c[3] != pRHS->m_c[3]); }
  27 | static inline vec4F *vec4F_set_scalar(vec4F *pV, float x) {	pV->m_c[0] = x; pV->m_c[1] = x; pV->m_c[2] = x;	pV->m_c[3] = x;	return pV; }
  28 | static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w) {	pV->m_c[0] = x;	pV->m_c[1] = y;	pV->m_c[2] = z;	pV->m_c[3] = w;	return pV; }
  29 | static inline vec4F *vec4F_saturate_in_place(vec4F *pV) { pV->m_c[0] = saturate(pV->m_c[0]); pV->m_c[1] = saturate(pV->m_c[1]); pV->m_c[2] = saturate(pV->m_c[2]); pV->m_c[3] = saturate(pV->m_c[3]); return pV; }
  30 | static inline vec4F vec4F_saturate(const vec4F *pV) { vec4F res; res.m_c[0] = saturate(pV->m_c[0]); res.m_c[1] = saturate(pV->m_c[1]); res.m_c[2] = saturate(pV->m_c[2]); res.m_c[3] = saturate(pV->m_c[3]); return res; }
  31 | static inline vec4F vec4F_from_color(const color_quad_u8 *pC) { vec4F res; vec4F_set(&res, pC->m_c[0], pC->m_c[1], pC->m_c[2], pC->m_c[3]); return res; }
  32 | static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] + pRHS->m_c[0], pLHS->m_c[1] + pRHS->m_c[1], pLHS->m_c[2] + pRHS->m_c[2], pLHS->m_c[3] + pRHS->m_c[3]); return res; }
  33 | static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] - pRHS->m_c[0], pLHS->m_c[1] - pRHS->m_c[1], pLHS->m_c[2] - pRHS->m_c[2], pLHS->m_c[3] - pRHS->m_c[3]); return res; }
  34 | static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS) { return pLHS->m_c[0] * pRHS->m_c[0] + pLHS->m_c[1] * pRHS->m_c[1] + pLHS->m_c[2] * pRHS->m_c[2] + pLHS->m_c[3] * pRHS->m_c[3]; }
  35 | static inline vec4F vec4F_mul(const vec4F *pLHS, float s) { vec4F res; vec4F_set(&res, pLHS->m_c[0] * s, pLHS->m_c[1] * s, pLHS->m_c[2] * s, pLHS->m_c[3] * s); return res; }
  36 | static inline vec4F *vec4F_normalize_in_place(vec4F *pV) { float s = pV->m_c[0] * pV->m_c[0] + pV->m_c[1] * pV->m_c[1] + pV->m_c[2] * pV->m_c[2] + pV->m_c[3] * pV->m_c[3]; if (s != 0.0f) { s = 1.0f / sqrtf(s); pV->m_c[0] *= s; pV->m_c[1] *= s; pV->m_c[2] *= s; pV->m_c[3] *= s; } return pV; }
  37 | 
  38 | // Various BC7 tables
// Interpolation weights, in 1/64 units, for 3-bit selectors (8 entries, 0..64).
static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
// Interpolation weights, in 1/64 units, for 4-bit selectors (16 entries, 0..64).
static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
// Precomputed weight constants used during the least squares fit. Four floats per entry of the matching
// g_bc7_weights table, with w = weight / 64: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
static const float g_bc7_weights3x[8 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f,
	0.079102f, 0.718750f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
// Same layout as g_bc7_weights3x, one group of four floats per entry of g_bc7_weights4.
static const float g_bc7_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f,
	0.451416f, 0.328125f, 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
	0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
// Partition table with a single subset: all 16 entries are 0.
static const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
// 64 two-subset partition patterns of 16 entries each; every entry is 0 or 1
// (presumably the subset index of the corresponding pixel of a 4x4 block - confirm against the users of this table).
static const uint8_t g_bc7_partition2[64 * 16] =
{
	0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,		0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,		0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,		0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,		0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,		0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
	0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,		0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,		0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
	0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,		0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,		0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,		0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,		0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,		0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,		0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,		0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
	0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,		0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,		0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,		0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,		0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,		0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,		0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,		0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
	0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,		0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,		0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,		0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,		0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,		0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,		0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,		0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
	0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,		0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,		0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,		0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,		0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,		0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,		0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,		0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
	0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,		0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,		0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,		0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,		0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,		0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,		0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
	0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,		0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,		0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,		0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,		0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,		0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,		0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,		0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
};
// One entry per two-subset partition pattern (64 entries); going by the name, the anchor pixel index of the second subset.
static const uint8_t g_bc7_table_anchor_index_second_subset[64] = {	15,15,15,15,15,15,15,15,		15,15,15,15,15,15,15,15,		15, 2, 8, 2, 2, 8, 8,15,		2, 8, 2, 2, 8, 8, 2, 2,		15,15, 6, 8, 2, 8,15,15,		2, 8, 2, 2, 2,15,15, 6,		6, 2, 6, 8,15,15, 2, 2,		15,15,15,15,15, 2, 2,15 };
// The tables below have 8 entries each - presumably one per BC7 mode (0..7), as g_bc7_color_index_bitcount is indexed by mode below.
static const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 };
static const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 0, 0, 0, 6 };
static const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };
// Returns the color index bit count of the given mode plus index_selection_bit.
static int get_bc7_color_index_size(int mode, int index_selection_bit) { return g_bc7_color_index_bitcount[mode] + index_selection_bit; }
static const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 };
static const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 };
static const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 };
static const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 };
  68 | 
// One entry of the single-color lookup table: the squared error of the best match and the
// low/high endpoint values (0..63 as filled in by bc7enc16_compress_block_init) that produce it.
typedef struct { uint16_t m_error; uint8_t m_lo; uint8_t m_hi; } endpoint_err;

// Best endpoint pair for every 8-bit component value and p-bit; filled in by bc7enc16_compress_block_init().
static endpoint_err g_bc7_mode_1_optimal_endpoints[256][2]; // [c][pbit]
// Selector index (into g_bc7_weights3) assumed when building and using the table above.
static const uint32_t BC7ENC16_MODE_1_OPTIMAL_INDEX = 2;
  73 | 
  74 | // Initialize the lookup table used for optimal single color compression in mode 1. Must be called before encoding.
  75 | void bc7enc16_compress_block_init()
  76 | {
  77 | 	for (int c = 0; c < 256; c++)
  78 | 	{
  79 | 		for (uint32_t lp = 0; lp < 2; lp++)
  80 | 		{
  81 | 			endpoint_err best;
  82 | 			best.m_error = (uint16_t)UINT16_MAX;
  83 | 			for (uint32_t l = 0; l < 64; l++)
  84 | 			{
  85 | 				uint32_t low = ((l << 1) | lp) << 1;
  86 | 				low |= (low >> 7);
  87 | 				for (uint32_t h = 0; h < 64; h++)
  88 | 				{
  89 | 					uint32_t high = ((h << 1) | lp) << 1;
  90 | 					high |= (high >> 7);
  91 | 					const int k = (low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6;
  92 | 					const int err = (k - c) * (k - c);
  93 | 					if (err < best.m_error)
  94 | 					{
  95 | 						best.m_error = (uint16_t)err;
  96 | 						best.m_lo = (uint8_t)l;
  97 | 						best.m_hi = (uint8_t)h;
  98 | 					}
  99 | 				} // h
 100 | 			} // l
 101 | 			g_bc7_mode_1_optimal_endpoints[c][lp] = best;
 102 | 		} // lp
 103 | 	} // c
 104 | }
 105 | 
 106 | static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors)
 107 | {
 108 | 	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
 109 | 	// I did this in matrix form first, expanded out all the ops, then optimized it a bit.
 110 | 	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
 111 | 	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
 112 | 	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
 113 | 	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
 114 | 	float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f;
 115 | 	for (uint32_t i = 0; i < N; i++)
 116 | 	{
 117 | 		const uint32_t sel = pSelectors[i];
 118 | 		z00 += pSelector_weights[sel].m_c[0];
 119 | 		z10 += pSelector_weights[sel].m_c[1];
 120 | 		z11 += pSelector_weights[sel].m_c[2];
 121 | 		float w = pSelector_weights[sel].m_c[3];
 122 | 		q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0];
 123 | 		q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1];
 124 | 		q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2];
 125 | 		q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3];
 126 | 	}
 127 | 
 128 | 	q10_r = t_r - q00_r;
 129 | 	q10_g = t_g - q00_g;
 130 | 	q10_b = t_b - q00_b;
 131 | 	q10_a = t_a - q00_a;
 132 | 
 133 | 	z01 = z10;
 134 | 
 135 | 	float det = z00 * z11 - z01 * z10;
 136 | 	if (det != 0.0f)
 137 | 		det = 1.0f / det;
 138 | 
 139 | 	float iz00, iz01, iz10, iz11;
 140 | 	iz00 = z11 * det;
 141 | 	iz01 = -z01 * det;
 142 | 	iz10 = -z10 * det;
 143 | 	iz11 = z00 * det;
 144 | 
 145 | 	pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r);
 146 | 	pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g);
 147 | 	pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b);
 148 | 	pXl->m_c[3] = (float)(iz00 * q00_a + iz01 * q10_a); pXh->m_c[3] = (float)(iz10 * q00_a + iz11 * q10_a);
 149 | }
 150 | 
 151 | static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors)
 152 | {
 153 | 	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
 154 | 	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
 155 | 	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
 156 | 	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
 157 | 	for (uint32_t i = 0; i < N; i++)
 158 | 	{
 159 | 		const uint32_t sel = pSelectors[i];
 160 | 		z00 += pSelector_weights[sel].m_c[0];
 161 | 		z10 += pSelector_weights[sel].m_c[1];
 162 | 		z11 += pSelector_weights[sel].m_c[2];
 163 | 		float w = pSelector_weights[sel].m_c[3];
 164 | 		q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0];
 165 | 		q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1];
 166 | 		q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2];
 167 | 	}
 168 | 
 169 | 	q10_r = t_r - q00_r;
 170 | 	q10_g = t_g - q00_g;
 171 | 	q10_b = t_b - q00_b;
 172 | 
 173 | 	z01 = z10;
 174 | 
 175 | 	float det = z00 * z11 - z01 * z10;
 176 | 	if (det != 0.0f)
 177 | 		det = 1.0f / det;
 178 | 
 179 | 	float iz00, iz01, iz10, iz11;
 180 | 	iz00 = z11 * det;
 181 | 	iz01 = -z01 * det;
 182 | 	iz10 = -z10 * det;
 183 | 	iz11 = z00 * det;
 184 | 
 185 | 	pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r);
 186 | 	pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g);
 187 | 	pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b);
 188 | 	pXl->m_c[3] = 255.0f; pXh->m_c[3] = 255.0f;
 189 | }
 190 | 
// Input parameters shared by the endpoint/selector search routines.
typedef struct
{
	uint32_t m_num_pixels;                 // number of entries in m_pPixels
	const color_quad_u8 *m_pPixels;        // the pixels being encoded
	uint32_t m_num_selector_weights;       // number of entries in m_pSelector_weights
	const uint32_t *m_pSelector_weights;   // interpolation weights in 1/64 units, one per selector value
	const vec4F *m_pSelector_weightsx;     // presumably the matching precomputed least squares constants (g_bc7_weights3x / g_bc7_weights4x) - confirm at the call sites
	uint32_t m_comp_bits;                  // endpoint bits per component, not counting a p-bit
	uint32_t m_weights[4];                 // per-channel error weights passed to compute_color_distance_rgb/rgba
	bc7enc16_bool m_has_alpha;             // when set, alpha is interpolated and measured along with RGB
	bc7enc16_bool m_has_pbits;             // when set, a p-bit is appended below the endpoint bits
	bc7enc16_bool m_endpoints_share_pbit;  // when set, both endpoints use pbits[0]
	bc7enc16_bool m_perceptual;            // selects the perceptual error metric in compute_color_distance_rgb
} color_cell_compressor_params;
 205 | 
// Output of the endpoint/selector search routines.
typedef struct
{
	uint64_t m_best_overall_err;     // total error of the stored solution
	color_quad_u8 m_low_endpoint;    // quantized low endpoint (without p-bit)
	color_quad_u8 m_high_endpoint;   // quantized high endpoint (without p-bit)
	uint32_t m_pbits[2];             // p-bits belonging to the endpoints
	uint8_t *m_pSelectors;           // caller-provided buffer; presumably receives the selectors of the best solution - confirm
	uint8_t *m_pSelectors_temp;      // caller-provided scratch buffer, written with one selector per pixel by evaluate_solution
} color_cell_compressor_results;
 215 | 
 216 | static inline color_quad_u8 scale_color(const color_quad_u8 *pC, const color_cell_compressor_params *pParams)
 217 | {
 218 | 	color_quad_u8 results;
 219 | 
 220 | 	const uint32_t n = pParams->m_comp_bits + (pParams->m_has_pbits ? 1 : 0);
 221 | 	assert((n >= 4) && (n <= 8));
 222 | 
 223 | 	for (uint32_t i = 0; i < 4; i++)
 224 | 	{
 225 | 		uint32_t v = pC->m_c[i] << (8 - n);
 226 | 		v |= (v >> n);
 227 | 		assert(v <= 255);
 228 | 		results.m_c[i] = (uint8_t)(v);
 229 | 	}
 230 | 
 231 | 	return results;
 232 | }
 233 | 
 234 | static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4])
 235 | {
 236 | 	int dr, dg, db;
 237 | 
 238 | 	if (perceptual)
 239 | 	{
 240 | 		const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37;
 241 | 		const int cr1 = ((int)pE1->m_c[0] << 9) - l1;
 242 | 		const int cb1 = ((int)pE1->m_c[2] << 9) - l1;
 243 | 		const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37;
 244 | 		const int cr2 = ((int)pE2->m_c[0] << 9) - l2;
 245 | 		const int cb2 = ((int)pE2->m_c[2] << 9) - l2;
 246 | 		dr = (l1 - l2) >> 8;
 247 | 		dg = (cr1 - cr2) >> 8;
 248 | 		db = (cb1 - cb2) >> 8;
 249 | 	}
 250 | 	else
 251 | 	{
 252 | 		dr = (int)pE1->m_c[0] - (int)pE2->m_c[0];
 253 | 		dg = (int)pE1->m_c[1] - (int)pE2->m_c[1];
 254 | 		db = (int)pE1->m_c[2] - (int)pE2->m_c[2];
 255 | 	}
 256 | 
 257 | 	return weights[0] * (uint32_t)(dr * dr) + weights[1] * (uint32_t)(dg * dg) + weights[2] * (uint32_t)(db * db);
 258 | }
 259 | 
 260 | static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4])
 261 | {
 262 | 	int da = (int)pE1->m_c[3] - (int)pE2->m_c[3];
 263 | 	return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + (weights[3] * (uint32_t)(da * da));
 264 | }
 265 | 
 266 | static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
 267 | {
 268 | 	uint32_t best_err = UINT_MAX;
 269 | 	uint32_t best_p = 0;
 270 | 
 271 | 	for (uint32_t p = 0; p < 2; p++)
 272 | 	{
 273 | 		uint32_t err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error;
 274 | 		if (err < best_err)
 275 | 		{
 276 | 			best_err = err;
 277 | 			best_p = p;
 278 | 		}
 279 | 	}
 280 | 
 281 | 	const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p];
 282 | 	const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p];
 283 | 	const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p];
 284 | 
 285 | 	color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0);
 286 | 	color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0);
 287 | 	pResults->m_pbits[0] = best_p;
 288 | 	pResults->m_pbits[1] = 0;
 289 | 
 290 | 	memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels);
 291 | 
 292 | 	color_quad_u8 p;
 293 | 	for (uint32_t i = 0; i < 3; i++)
 294 | 	{
 295 | 		uint32_t low = ((pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1;
 296 | 		low |= (low >> 7);
 297 | 
 298 | 		uint32_t high = ((pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1;
 299 | 		high |= (high >> 7);
 300 | 
 301 | 		p.m_c[i] = (uint8_t)((low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6);
 302 | 	}
 303 | 	p.m_c[3] = 255;
 304 | 
 305 | 	uint64_t total_err = 0;
 306 | 	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
 307 | 		total_err += compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights);
 308 | 
 309 | 	pResults->m_best_overall_err = total_err;
 310 | 
 311 | 	return total_err;
 312 | }
 313 | 
 314 | static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
 315 | {
 316 | 	color_quad_u8 quantMinColor = *pLow;
 317 | 	color_quad_u8 quantMaxColor = *pHigh;
 318 | 
 319 | 	if (pParams->m_has_pbits)
 320 | 	{
 321 | 		uint32_t minPBit, maxPBit;
 322 | 
 323 | 		if (pParams->m_endpoints_share_pbit)
 324 | 			maxPBit = minPBit = pbits[0];
 325 | 		else
 326 | 		{
 327 | 			minPBit = pbits[0];
 328 | 			maxPBit = pbits[1];
 329 | 		}
 330 | 
 331 | 		quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit);
 332 | 		quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit);
 333 | 		quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit);
 334 | 		quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit);
 335 | 
 336 | 		quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit);
 337 | 		quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit);
 338 | 		quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit);
 339 | 		quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit);
 340 | 	}
 341 | 
 342 | 	color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams);
 343 | 	color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams);
 344 | 
 345 | 	const uint32_t N = pParams->m_num_selector_weights;
 346 | 
 347 | 	color_quad_u8 weightedColors[16];
 348 | 	weightedColors[0] = actualMinColor;
 349 | 	weightedColors[N - 1] = actualMaxColor;
 350 | 
 351 | 	const uint32_t nc = pParams->m_has_alpha ? 4 : 3;
 352 | 	for (uint32_t i = 1; i < (N - 1); i++)
 353 | 		for (uint32_t j = 0; j < nc; j++)
 354 | 			weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6);
 355 | 
 356 | 	const int lr = actualMinColor.m_c[0];
 357 | 	const int lg = actualMinColor.m_c[1];
 358 | 	const int lb = actualMinColor.m_c[2];
 359 | 	const int dr = actualMaxColor.m_c[0] - lr;
 360 | 	const int dg = actualMaxColor.m_c[1] - lg;
 361 | 	const int db = actualMaxColor.m_c[2] - lb;
 362 | 
 363 | 	uint64_t total_err = 0;
 364 | 
 365 | 	if (!pParams->m_perceptual)
 366 | 	{
 367 | 		if (pParams->m_has_alpha)
 368 | 		{
 369 | 			const int la = actualMinColor.m_c[3];
 370 | 			const int da = actualMaxColor.m_c[3] - la;
 371 | 
 372 | 			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);
 373 | 
 374 | 			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
 375 | 			{
 376 | 				const color_quad_u8 *pC = &pParams->m_pPixels[i];
 377 | 				int r = pC->m_c[0];
 378 | 				int g = pC->m_c[1];
 379 | 				int b = pC->m_c[2];
 380 | 				int a = pC->m_c[3];
 381 | 
 382 | 				int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
 383 | 				best_sel = clampi(best_sel, 1, N - 1);
 384 | 
 385 | 				uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
 386 | 				uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams->m_weights);
 387 | 
 388 | 				if (err1 > err0)
 389 | 				{
 390 | 					err1 = err0;
 391 | 					--best_sel;
 392 | 				}
 393 | 				total_err += err1;
 394 | 
 395 | 				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
 396 | 			}
 397 | 		}
 398 | 		else
 399 | 		{
 400 | 			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);
 401 | 
 402 | 			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
 403 | 			{
 404 | 				const color_quad_u8 *pC = &pParams->m_pPixels[i];
 405 | 				int r = pC->m_c[0];
 406 | 				int g = pC->m_c[1];
 407 | 				int b = pC->m_c[2];
 408 | 
 409 | 				int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
 410 | 				sel = clampi(sel, 1, N - 1);
 411 | 
 412 | 				uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
 413 | 				uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams->m_weights);
 414 | 
 415 | 				int best_sel = sel;
 416 | 				uint64_t best_err = err1;
 417 | 				if (err0 < best_err)
 418 | 				{
 419 | 					best_err = err0;
 420 | 					best_sel = sel - 1;
 421 | 				}
 422 | 
 423 | 				total_err += best_err;
 424 | 
 425 | 				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
 426 | 			}
 427 | 		}
 428 | 	}
 429 | 	else
 430 | 	{
 431 | 		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
 432 | 		{
 433 | 			uint64_t best_err = UINT64_MAX;
 434 | 			uint32_t best_sel = 0;
 435 | 
 436 | 			if (pParams->m_has_alpha)
 437 | 			{
 438 | 				for (uint32_t j = 0; j < N; j++)
 439 | 				{
 440 | 					uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
 441 | 					if (err < best_err)
 442 | 					{
 443 | 						best_err = err;
 444 | 						best_sel = j;
 445 | 					}
 446 | 				}
 447 | 			}
 448 | 			else
 449 | 			{
 450 | 				for (uint32_t j = 0; j < N; j++)
 451 | 				{
 452 | 					uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
 453 | 					if (err < best_err)
 454 | 					{
 455 | 						best_err = err;
 456 | 						best_sel = j;
 457 | 					}
 458 | 				}
 459 | 			}
 460 | 
 461 | 			total_err += best_err;
 462 | 
 463 | 			pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
 464 | 		}
 465 | 	}
 466 | 
 467 | 	if (total_err < pResults->m_best_overall_err)
 468 | 	{
 469 | 		pResults->m_best_overall_err = total_err;
 470 | 
 471 | 		pResults->m_low_endpoint = *pLow;
 472 | 		pResults->m_high_endpoint = *pHigh;
 473 | 
 474 | 		pResults->m_pbits[0] = pbits[0];
 475 | 		pResults->m_pbits[1] = pbits[1];
 476 | 
 477 | 		memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
 478 | 	}
 479 | 
 480 | 	return total_err;
 481 | }
 482 | 
 483 | static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const vec4F *pXl, const vec4F *pXh, uint32_t iscale)
 484 | {
 485 | 	if (mode == 1)
 486 | 	{
 487 | 		// fix degenerate case where the input collapses to a single colorspace voxel, and we loose all freedom (test with grayscale ramps)
 488 | 		for (uint32_t i = 0; i < 3; i++)
 489 | 		{
 490 | 			if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i])
 491 | 			{
 492 | 				if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.0f)
 493 | 				{
 494 | 					if (pTrialMinColor->m_c[i] > (iscale >> 1))
 495 | 					{
 496 | 						if (pTrialMinColor->m_c[i] > 0)
 497 | 							pTrialMinColor->m_c[i]--;
 498 | 						else
 499 | 							if (pTrialMaxColor->m_c[i] < iscale)
 500 | 								pTrialMaxColor->m_c[i]++;
 501 | 					}
 502 | 					else
 503 | 					{
 504 | 						if (pTrialMaxColor->m_c[i] < iscale)
 505 | 							pTrialMaxColor->m_c[i]++;
 506 | 						else if (pTrialMinColor->m_c[i] > 0)
 507 | 							pTrialMinColor->m_c[i]--;
 508 | 					}
 509 | 				}
 510 | 			}
 511 | 		}
 512 | 	}
 513 | }
 514 | 
 515 | static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
 516 | {
 517 | 	vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh);
 518 | 
 519 | 	if (pParams->m_has_pbits)
 520 | 	{
 521 | 		const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1;
 522 | 		const float scalep = (float)iscalep;
 523 | 
 524 | 		const int32_t totalComps = pParams->m_has_alpha ? 4 : 3;
 525 | 
 526 | 		uint32_t best_pbits[2];
 527 | 		color_quad_u8 bestMinColor, bestMaxColor;
 528 | 
 529 | 		if (!pParams->m_endpoints_share_pbit)
 530 | 		{
 531 | 			float best_err0 = 1e+9;
 532 | 			float best_err1 = 1e+9;
 533 | 
 534 | 			for (int p = 0; p < 2; p++)
 535 | 			{
 536 | 				color_quad_u8 xMinColor, xMaxColor;
 537 | 
 538 | 				// Notes: The pbit controls which quantization intervals are selected.
 539 | 				// total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
 540 | 				// pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
 541 | 				// rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
 542 | 				// rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
 543 | 				for (uint32_t c = 0; c < 4; c++)
 544 | 				{
 545 | 					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
 546 | 					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
 547 | 				}
 548 | 
 549 | 				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
 550 | 				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
 551 | 
 552 | 				float err0 = 0, err1 = 0;
 553 | 				for (int i = 0; i < totalComps; i++)
 554 | 				{
 555 | 					err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
 556 | 					err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
 557 | 				}
 558 | 
 559 | 				if (err0 < best_err0)
 560 | 				{
 561 | 					best_err0 = err0;
 562 | 					best_pbits[0] = p;
 563 | 
 564 | 					bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1;
 565 | 					bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1;
 566 | 					bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1;
 567 | 					bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1;
 568 | 				}
 569 | 
 570 | 				if (err1 < best_err1)
 571 | 				{
 572 | 					best_err1 = err1;
 573 | 					best_pbits[1] = p;
 574 | 
 575 | 					bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1;
 576 | 					bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1;
 577 | 					bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1;
 578 | 					bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1;
 579 | 				}
 580 | 			}
 581 | 		}
 582 | 		else
 583 | 		{
 584 | 			// Endpoints share pbits
 585 | 			float best_err = 1e+9;
 586 | 
 587 | 			for (int p = 0; p < 2; p++)
 588 | 			{
 589 | 				color_quad_u8 xMinColor, xMaxColor;
 590 | 				for (uint32_t c = 0; c < 4; c++)
 591 | 				{
 592 | 					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
 593 | 					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
 594 | 				}
 595 | 
 596 | 				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
 597 | 				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
 598 | 
 599 | 				float err = 0;
 600 | 				for (int i = 0; i < totalComps; i++)
 601 | 					err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);
 602 | 
 603 | 				if (err < best_err)
 604 | 				{
 605 | 					best_err = err;
 606 | 					best_pbits[0] = p;
 607 | 					best_pbits[1] = p;
 608 | 					for (uint32_t j = 0; j < 4; j++)
 609 | 					{
 610 | 						bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1;
 611 | 						bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1;
 612 | 					}
 613 | 				}
 614 | 			}
 615 | 		}
 616 | 
 617 | 		fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1);
 618 | 
 619 | 		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1]))
 620 | 			evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults);
 621 | 	}
 622 | 	else
 623 | 	{
 624 | 		const int iscale = (1 << pParams->m_comp_bits) - 1;
 625 | 		const float scale = (float)iscale;
 626 | 
 627 | 		color_quad_u8 trialMinColor, trialMaxColor;
 628 | 		color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f));
 629 | 		color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f));
 630 | 
 631 | 		fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale);
 632 | 
 633 | 		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
 634 | 			evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
 635 | 	}
 636 | 
 637 | 	return pResults->m_best_overall_err;
 638 | }
 639 | 
// Finds endpoints and per-pixel selectors for one subset of pixels (pParams->m_pPixels) and leaves the best
// solution found in pResults.
//
// Stages, in order:
//   1. Mode 1 only: if every pixel has the same RGB, pack it as a single color and return immediately.
//   2. Derive initial endpoints from the pixels' mean and principal axis.
//   3. If pComp_params->m_try_least_squares: refit the endpoints from the current selectors.
//   4. If pComp_params->m_uber_level > 0: perturb (and at level 2+, rescale) the selectors and refit.
//   5. Mode 1 only: also try encoding the rounded mean color as a single color.
//
// Returns pResults->m_best_overall_err, or 0 as soon as any find_optimal_solution() call reports zero error.
static uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, const bc7enc16_compress_block_params *pComp_params)
{
	// Alpha is only accepted together with mode 6.
	assert((mode == 6) || (!pParams->m_has_alpha));

	// UINT64_MAX marks "no solution yet"; find_optimal_solution() always evaluates its candidate while this is set.
	pResults->m_best_overall_err = UINT64_MAX;

	// If the partition's colors are all the same in mode 1, then just pack them as a single color.
	if (mode == 1)
	{
		const uint32_t cr = pParams->m_pPixels[0].m_c[0], cg = pParams->m_pPixels[0].m_c[1], cb = pParams->m_pPixels[0].m_c[2];

		bc7enc16_bool allSame = BC7ENC16_TRUE;
		for (uint32_t i = 1; i < pParams->m_num_pixels; i++)
		{
			if ((cr != pParams->m_pPixels[i].m_c[0]) || (cg != pParams->m_pPixels[i].m_c[1]) || (cb != pParams->m_pPixels[i].m_c[2]))
			{
				allSame = BC7ENC16_FALSE;
				break;
			}
		}

		if (allSame)
			return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
	}

	// Compute partition's mean color and principal axis.
	vec4F meanColor, axis;
	vec4F_set_scalar(&meanColor, 0.0f);

	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
	{
		vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
		meanColor = vec4F_add(&meanColor, &color);
	}

	// meanColorScaled: mean in the same units vec4F_from_color() produces (used to center the pixels below).
	vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels));

	// meanColor: the same mean additionally divided by 255, then saturated.
	meanColor = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels * 255.0f));
	vec4F_saturate_in_place(&meanColor);

	if (pParams->m_has_alpha)
	{
		// Use incremental PCA for RGBA PCA, because it's simple.
		vec4F_set_scalar(&axis, 0.0f);
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
			color = vec4F_sub(&color, &meanColorScaled);
			// a..d: the centered pixel scaled by each of its own four components.
			vec4F a = vec4F_mul(&color, color.m_c[0]);
			vec4F b = vec4F_mul(&color, color.m_c[1]);
			vec4F c = vec4F_mul(&color, color.m_c[2]);
			vec4F d = vec4F_mul(&color, color.m_c[3]);
			// n: normalized copy of the running axis (the first centered pixel is used on the first iteration).
			vec4F n = i ? axis : color;
			vec4F_normalize_in_place(&n);
			axis.m_c[0] += vec4F_dot(&a, &n);
			axis.m_c[1] += vec4F_dot(&b, &n);
			axis.m_c[2] += vec4F_dot(&c, &n);
			axis.m_c[3] += vec4F_dot(&d, &n);
		}
		vec4F_normalize_in_place(&axis);
	}
	else
	{
		// Use covar technique for RGB PCA, because it doesn't require per-pixel normalization.
		// cov holds the unnormalized sums rr, rg, rb, gg, gb, bb of the centered pixels.
		float cov[6] = { 0, 0, 0, 0, 0, 0 };

		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			const color_quad_u8 *pV = &pParams->m_pPixels[i];
			float r = pV->m_c[0] - meanColorScaled.m_c[0];
			float g = pV->m_c[1] - meanColorScaled.m_c[1];
			float b = pV->m_c[2] - meanColorScaled.m_c[2];
			cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b;
		}

		// Three rounds of multiplying a starting vector by the covariance sums; each result is rescaled by its
		// largest absolute component (unless that is tiny) before the next round.
		float vfr = .9f, vfg = 1.0f, vfb = .7f;
		for (uint32_t iter = 0; iter < 3; iter++)
		{
			float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2];
			float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4];
			float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5];

			float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
			if (m > 1e-10f)
			{
				m = 1.0f / m;
				r *= m; g *= m;	b *= m;
			}

			vfr = r; vfg = g; vfb = b;
		}

		// Normalize to unit length, or zero the axis when the vector is too short to normalize.
		float len = vfr*vfr + vfg*vfg + vfb*vfb;
		if (len < 1e-10f)
			vec4F_set_scalar(&axis, 0.0f);
		else
		{
			len = 1.0f / sqrtf(len);
			vfr *= len; vfg *= len; vfb *= len;
			vec4F_set(&axis, vfr, vfg, vfb, 0);
		}
	}

	// The axis came out with squared length below .5 (e.g. it was zeroed above): fall back to a fixed direction.
	if (vec4F_dot(&axis, &axis) < .5f)
	{
		if (pParams->m_perceptual)
			vec4F_set(&axis, .213f, .715f, .072f, pParams->m_has_alpha ? .715f : 0);
		else
			vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams->m_has_alpha ? 1.0f : 0);
		vec4F_normalize_in_place(&axis);
	}

	// Project every centered pixel onto the axis and record the smallest (l) and largest (h) projection.
	float l = 1e+9f, h = -1e+9f;

	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
	{
		vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);

		vec4F q = vec4F_sub(&color, &meanColorScaled);
		float d = vec4F_dot(&q, &axis);

		l = minimumf(l, d);
		h = maximumf(h, d);
	}

	// Bring the extents to the same 1/255 scale as meanColor.
	l *= (1.0f / 255.0f);
	h *= (1.0f / 255.0f);

	// Initial endpoints: mean + axis * extent, saturated.
	vec4F b0 = vec4F_mul(&axis, l);
	vec4F b1 = vec4F_mul(&axis, h);
	vec4F c0 = vec4F_add(&meanColor, &b0);
	vec4F c1 = vec4F_add(&meanColor, &b1);
	vec4F minColor = vec4F_saturate(&c0);
	vec4F maxColor = vec4F_saturate(&c1);

	// Order the endpoints so the one with the smaller component sum (dot with all-ones) is the low endpoint.
	vec4F whiteVec;
	vec4F_set_scalar(&whiteVec, 1.0f);
	if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec))
	{
		vec4F temp = minColor;
		minColor = maxColor;
		maxColor = temp;
	}
	// First find a solution using the block's PCA.
	if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults))
		return 0;

	if (pComp_params->m_try_least_squares)
	{
		// Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors.
		vec4F xl, xh;
		vec4F_set_scalar(&xl, 0.0f);
		vec4F_set_scalar(&xh, 0.0f);
		if (pParams->m_has_alpha)
			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
		else
			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);

		// find_optimal_solution() is given endpoints scaled by 1/255, like the PCA endpoints above.
		xl = vec4F_mul(&xl, (1.0f / 255.0f));
		xh = vec4F_mul(&xh, (1.0f / 255.0f));

		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
			return 0;
	}

	if (pComp_params->m_uber_level > 0)
	{
		// In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors,
		// then try decrementing the selectors, then try both.
		// selectors_temp: snapshot of the best selectors so far; selectors_temp1: the perturbed copy fed to least squares.
		uint8_t selectors_temp[16], selectors_temp1[16];
		memcpy(selectors_temp, pResults->m_pSelectors, pParams->m_num_pixels);

		const int max_selector = pParams->m_num_selector_weights - 1;

		// Smallest and largest selector values present in the snapshot.
		uint32_t min_sel = 16;
		uint32_t max_sel = 0;
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			uint32_t sel = selectors_temp[i];
			min_sel = minimumu(min_sel, sel);
			max_sel = maximumu(max_sel, sel);
		}

		// Variation 1: raise every pixel that uses the minimum selector by one.
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			uint32_t sel = selectors_temp[i];
			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
				sel++;
			selectors_temp1[i] = (uint8_t)sel;
		}

		vec4F xl, xh;
		vec4F_set_scalar(&xl, 0.0f);
		vec4F_set_scalar(&xh, 0.0f);
		if (pParams->m_has_alpha)
			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
		else
			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);

		xl = vec4F_mul(&xl, (1.0f / 255.0f));
		xh = vec4F_mul(&xh, (1.0f / 255.0f));

		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
			return 0;

		// Variation 2: lower every pixel that uses the maximum selector by one.
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			uint32_t sel = selectors_temp[i];
			if ((sel == max_sel) && (sel > 0))
				sel--;
			selectors_temp1[i] = (uint8_t)sel;
		}

		// NOTE(review): xl/xh are not reset to zero before this refit or the next one; this presumably relies on
		// compute_least_squares_endpoints_*() overwriting every component it is responsible for - confirm.
		if (pParams->m_has_alpha)
			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
		else
			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);

		xl = vec4F_mul(&xl, (1.0f / 255.0f));
		xh = vec4F_mul(&xh, (1.0f / 255.0f));

		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
			return 0;

		// Variation 3: apply both adjustments at once (the increment wins when a selector is both min and max).
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			uint32_t sel = selectors_temp[i];
			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
				sel++;
			else if ((sel == max_sel) && (sel > 0))
				sel--;
			selectors_temp1[i] = (uint8_t)sel;
		}

		if (pParams->m_has_alpha)
			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
		else
			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);

		xl = vec4F_mul(&xl, (1.0f / 255.0f));
		xh = vec4F_mul(&xh, (1.0f / 255.0f));

		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
			return 0;

		// In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another.
		// Only attempted while the best error is still above (num_pixels * 56) >> 4, i.e. about 3.5 per pixel.
		const uint32_t uber_err_thresh = (pParams->m_num_pixels * 56) >> 4;
		if ((pComp_params->m_uber_level >= 2) && (pResults->m_best_overall_err > uber_err_thresh))
		{
			// Q bounds the search: ly runs over [-Q, 1] and hy over [max_selector - 1, max_selector + Q].
			const int Q = (pComp_params->m_uber_level >= 4) ? (pComp_params->m_uber_level - 2) : 1;
			for (int ly = -Q; ly <= 1; ly++)
			{
				for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++)
				{
					// ly == 0 with hy == max_selector is the identity remap, which was already evaluated.
					if ((ly == 0) && (hy == max_selector))
						continue;

					// Linearly remap the selectors so ly maps to 0 and hy maps to max_selector, rounding and clamping.
					for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
						selectors_temp1[i] = (uint8_t)clampf(floorf((float)max_selector * ((float)selectors_temp[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_selector);

					// xl/xh declared earlier in this scope are reused here.
					vec4F_set_scalar(&xl, 0.0f);
					vec4F_set_scalar(&xh, 0.0f);
					if (pParams->m_has_alpha)
						compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
					else
						compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);

					xl = vec4F_mul(&xl, (1.0f / 255.0f));
					xh = vec4F_mul(&xh, (1.0f / 255.0f));

					if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
						return 0;
				}
			}
		}
	}

	if (mode == 1)
	{
		// Try encoding the partition as a single color by using the optimal single color tables to encode the block to its mean.
		// avg_results is a scratch copy so the current best in pResults survives if the single-color attempt is worse.
		color_cell_compressor_results avg_results = *pResults;
		// Mean color rounded back to integer 0..255 components.
		const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
		uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
		if (avg_err < pResults->m_best_overall_err)
		{
			// The single color wins: adopt its results and promote the selectors written to m_pSelectors_temp.
			*pResults = avg_results;
			memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
			pResults->m_best_overall_err = avg_err;
		}
	}

	return pResults->m_best_overall_err;
}
 934 | 
 935 | static uint64_t color_cell_compression_est(uint32_t num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far)
 936 | {
 937 | 	// Find RGB bounds as an approximation of the block's principle axis
 938 | 	uint32_t lr = 255, lg = 255, lb = 255;
 939 | 	uint32_t hr = 0, hg = 0, hb = 0;
 940 | 	for (uint32_t i = 0; i < num_pixels; i++)
 941 | 	{
 942 | 		const color_quad_u8 *pC = &pPixels[i];
 943 | 		if (pC->m_c[0] < lr) lr = pC->m_c[0];
 944 | 		if (pC->m_c[1] < lg) lg = pC->m_c[1];
 945 | 		if (pC->m_c[2] < lb) lb = pC->m_c[2];
 946 | 		if (pC->m_c[0] > hr) hr = pC->m_c[0];
 947 | 		if (pC->m_c[1] > hg) hg = pC->m_c[1];
 948 | 		if (pC->m_c[2] > hb) hb = pC->m_c[2];
 949 | 	}
 950 | 
 951 | 	color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0);
 952 | 	color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0);
 953 | 
 954 | 	// Place endpoints at bbox diagonals and compute interpolated colors
 955 | 	const uint32_t N = 8;
 956 | 	color_quad_u8 weightedColors[8];
 957 | 
 958 | 	weightedColors[0] = lowColor;
 959 | 	weightedColors[N - 1] = highColor;
 960 | 	for (uint32_t i = 1; i < (N - 1); i++)
 961 | 	{
 962 | 		weightedColors[i].m_c[0] = (uint8_t)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6);
 963 | 		weightedColors[i].m_c[1] = (uint8_t)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * g_bc7_weights3[i] + 32) >> 6);
 964 | 		weightedColors[i].m_c[2] = (uint8_t)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6);
 965 | 	}
 966 | 
 967 | 	// Compute dots and thresholds
 968 | 	const int ar = highColor.m_c[0] - lowColor.m_c[0];
 969 | 	const int ag = highColor.m_c[1] - lowColor.m_c[1];
 970 | 	const int ab = highColor.m_c[2] - lowColor.m_c[2];
 971 | 
 972 | 	int dots[8];
 973 | 	for (uint32_t i = 0; i < N; i++)
 974 | 		dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab;
 975 | 
 976 | 	int thresh[8 - 1];
 977 | 	for (uint32_t i = 0; i < (N - 1); i++)
 978 | 		thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1;
 979 | 
 980 | 	uint64_t total_err = 0;
 981 | 	if (perceptual)
 982 | 	{
 983 | 		// Transform block's interpolated colors to YCbCr
 984 | 		int l1[8], cr1[8], cb1[8];
 985 | 		for (int j = 0; j < 8; j++)
 986 | 		{
 987 | 			const color_quad_u8 *pE1 = &weightedColors[j];
 988 | 			l1[j] = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37;
 989 | 			cr1[j] = ((int)pE1->m_c[0] << 9) - l1[j];
 990 | 			cb1[j] = ((int)pE1->m_c[2] << 9) - l1[j];
 991 | 		}
 992 | 
 993 | 		for (uint32_t i = 0; i < num_pixels; i++)
 994 | 		{
 995 | 			const color_quad_u8 *pC = &pPixels[i];
 996 | 
 997 | 			int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
 998 | 
 999 | 			// Find approximate selector
1000 | 			uint32_t s = 0;
1001 | 			if (d >= thresh[6])
1002 | 				s = 7;
1003 | 			else if (d >= thresh[5])
1004 | 				s = 6;
1005 | 			else if (d >= thresh[4])
1006 | 				s = 5;
1007 | 			else if (d >= thresh[3])
1008 | 				s = 4;
1009 | 			else if (d >= thresh[2])
1010 | 				s = 3;
1011 | 			else if (d >= thresh[1])
1012 | 				s = 2;
1013 | 			else if (d >= thresh[0])
1014 | 				s = 1;
1015 | 
1016 | 			// Compute error
1017 | 			const int l2 = pC->m_c[0] * 109 + pC->m_c[1] * 366 + pC->m_c[2] * 37;
1018 | 			const int cr2 = ((int)pC->m_c[0] << 9) - l2;
1019 | 			const int cb2 = ((int)pC->m_c[2] << 9) - l2;
1020 | 
1021 | 			const int dl = (l1[s] - l2) >> 8;
1022 | 			const int dcr = (cr1[s] - cr2) >> 8;
1023 | 			const int dcb = (cb1[s] - cb2) >> 8;
1024 | 
1025 | 			int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb);
1026 | 
1027 | 			total_err += ie;
1028 | 			if (total_err > best_err_so_far)
1029 | 				break;
1030 | 		}
1031 | 	}
1032 | 	else
1033 | 	{
1034 | 		for (uint32_t i = 0; i < num_pixels; i++)
1035 | 		{
1036 | 			const color_quad_u8 *pC = &pPixels[i];
1037 | 
1038 | 			int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
1039 | 
1040 | 			// Find approximate selector
1041 | 			uint32_t s = 0;
1042 | 			if (d >= thresh[6])
1043 | 				s = 7;
1044 | 			else if (d >= thresh[5])
1045 | 				s = 6;
1046 | 			else if (d >= thresh[4])
1047 | 				s = 5;
1048 | 			else if (d >= thresh[3])
1049 | 				s = 4;
1050 | 			else if (d >= thresh[2])
1051 | 				s = 3;
1052 | 			else if (d >= thresh[1])
1053 | 				s = 2;
1054 | 			else if (d >= thresh[0])
1055 | 				s = 1;
1056 | 
1057 | 			// Compute error
1058 | 			const color_quad_u8 *pE1 = &weightedColors[s];
1059 | 
1060 | 			int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
1061 | 			int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
1062 | 			int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
1063 | 
1064 | 			total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db);
1065 | 			if (total_err > best_err_so_far)
1066 | 				break;
1067 | 		}
1068 | 	}
1069 | 
1070 | 	return total_err;
1071 | }
1072 | 
// This table contains bitmasks indicating which "key" partitions must be best ranked before this partition is worth evaluating.
// We first rank the best/most used 14 partitions (sorted by usefulness), record the best one found as the key partition, then use
// that to control the other partitions to evaluate. The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0).
//
// The table is indexed by partition index (0-based). estimate_partition() tests bit (k + 1), where k is the 0-based index of the
// key partition, so bit n below refers to partition number n in the 1-based numbering used by s_sorted_partition_order.
// UINT32_MAX entries have every bit set and are therefore never filtered out.
static const uint32_t g_partition_predictors[35] =
{
	UINT32_MAX, // partition 0
	UINT32_MAX, // partition 1
	UINT32_MAX, // partition 2
	UINT32_MAX, // partition 3
	UINT32_MAX, // partition 4
	(1 << 1) | (1 << 2) | (1 << 8), // partition 5
	(1 << 1) | (1 << 3) | (1 << 7), // partition 6
	UINT32_MAX, // partition 7
	UINT32_MAX, // partition 8
	(1 << 2) | (1 << 8) | (1 << 16), // partition 9
	(1 << 7) | (1 << 3) | (1 << 15), // partition 10
	UINT32_MAX, // partition 11
	(1 << 8) | (1 << 14) | (1 << 16), // partition 12
	(1 << 7) | (1 << 14) | (1 << 15), // partition 13
	UINT32_MAX, // partition 14
	UINT32_MAX, // partition 15
	UINT32_MAX, // partition 16
	UINT32_MAX, // partition 17
	(1 << 14) | (1 << 15), // partition 18
	(1 << 16) | (1 << 22) | (1 << 14), // partition 19
	(1 << 17) | (1 << 24) | (1 << 14), // partition 20
	(1 << 2) | (1 << 14) | (1 << 15) | (1 << 1), // partition 21
	UINT32_MAX, // partition 22
	(1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22), // partition 23
	UINT32_MAX, // partition 24
	(1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24), // partition 25
	(1 << 1) | (1 << 3) | (1 << 22), // partition 26
	UINT32_MAX, // partition 27
	UINT32_MAX, // partition 28
	UINT32_MAX, // partition 29
	(1 << 14) | (1 << 15) | (1 << 16) | (1 << 17), // partition 30
	UINT32_MAX, // partition 31
	UINT32_MAX, // partition 32
	(1 << 1) | (1 << 2) | (1 << 3) | (1 << 27) | (1 << 4) | (1 << 24), // partition 33
	(1 << 14) | (1 << 15) | (1 << 16) | (1 << 11) | (1 << 17) | (1 << 27) // partition 34
};
1114 | 
1115 | // Estimate the partition used by mode 1. This scans through each partition and computes an approximate error for each.
1116 | static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, uint32_t pweights[4])
1117 | {
1118 | 	const uint32_t total_partitions = minimumu(pComp_params->m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1);
1119 | 	if (total_partitions <= 1)
1120 | 		return 0;
1121 | 
1122 | 	uint64_t best_err = UINT64_MAX;
1123 | 	uint32_t best_partition = 0;
1124 | 
1125 | 	// Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34.
1126 | 	// Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality.
1127 | 	static const uint8_t s_sorted_partition_order[64] =
1128 | 	{
1129 | 		1 - 1, 14 - 1, 2 - 1, 3 - 1, 16 - 1, 15 - 1, 11 - 1, 17 - 1,
1130 | 		4 - 1, 24 - 1, 27 - 1, 7 - 1, 8 - 1, 22 - 1, 20 - 1, 30 - 1,
1131 | 		9 - 1, 5 - 1, 10 - 1, 21 - 1, 6 - 1, 32 - 1, 23 - 1, 18 - 1,
1132 | 		19 - 1, 12 - 1, 13 - 1, 31 - 1, 25 - 1, 26 - 1, 29 - 1, 28 - 1,
1133 | 		33 - 1, 34 - 1, 35 - 1, 46 - 1, 47 - 1, 52 - 1, 50 - 1, 51 - 1,
1134 | 		49 - 1, 39 - 1, 40 - 1, 38 - 1, 54 - 1, 53 - 1, 55 - 1, 37 - 1,
1135 | 		58 - 1, 59 - 1, 56 - 1, 42 - 1, 41 - 1, 43 - 1, 44 - 1, 60 - 1,
1136 | 		45 - 1, 57 - 1, 48 - 1, 36 - 1, 61 - 1, 64 - 1, 63 - 1, 62 - 1
1137 | 	};
1138 | 
1139 | 	assert(s_sorted_partition_order[34] == 34);
1140 | 
1141 | 	int best_key_partition = 0;
1142 | 
1143 | 	for (uint32_t partition_iter = 0; (partition_iter < total_partitions) && (best_err > 0); partition_iter++)
1144 | 	{
1145 | 		const uint32_t partition = s_sorted_partition_order[partition_iter];
1146 | 
1147 | 		// Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14.
1148 | 		if (pComp_params->m_mode1_partition_estimation_filterbank)
1149 | 		{
1150 | 			if ((partition_iter >= 14) && (partition_iter <= 34))
1151 | 			{
1152 | 				const uint32_t best_key_partition_bitmask = 1 << (best_key_partition + 1);
1153 | 				if ((g_partition_predictors[partition] & best_key_partition_bitmask) == 0)
1154 | 				{
1155 | 					if (partition_iter == 34)
1156 | 						break;
1157 | 
1158 | 					continue;
1159 | 				}
1160 | 			}
1161 | 		}
1162 | 
1163 | 		const uint8_t *pPartition = &g_bc7_partition2[partition * 16];
1164 | 
1165 | 		color_quad_u8 subset_colors[2][16];
1166 | 		uint32_t subset_total_colors[2] = { 0, 0 };
1167 | 		for (uint32_t index = 0; index < 16; index++)
1168 | 			subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = pPixels[index];
1169 | 
1170 | 		uint64_t total_subset_err = 0;
1171 | 		for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++)
1172 | 			total_subset_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params->m_perceptual, pweights, best_err);
1173 | 
1174 | 		if (total_subset_err < best_err)
1175 | 		{
1176 | 			best_err = total_subset_err;
1177 | 			best_partition = partition;
1178 | 		}
1179 | 
1180 | 		// If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either.
1181 | 		if ((partition == 34) && (best_partition != 34))
1182 | 			break;
1183 | 
1184 | 		if (partition_iter == 13)
1185 | 			best_key_partition = best_partition;
1186 | 
1187 | 	} // partition
1188 | 
1189 | 	return best_partition;
1190 | }
1191 | 
// Appends the low num_bits bits of val to the block's bitstream, least significant bit first.
// pBytes is the block buffer (bits are OR-ed in, so the target bits must already be zero),
// *pCur_ofs is the current write position in bits and is advanced by num_bits on return.
static void set_block_bits(uint8_t *pBytes, uint32_t val, uint32_t num_bits, uint32_t *pCur_ofs)
{
	assert((num_bits <= 32) && (val < (1ULL << num_bits)));

	uint32_t bit_pos = *pCur_ofs;
	for (uint32_t bits_left = num_bits; bits_left != 0; )
	{
		// Emit at most up to the end of the current byte on each pass.
		const uint32_t bit_in_byte = bit_pos & 7;
		const uint32_t chunk = minimumu(8 - bit_in_byte, bits_left);

		// The uint8_t cast discards whatever spills past the current byte; those bits go out on the next pass.
		pBytes[bit_pos >> 3] |= (uint8_t)(val << bit_in_byte);

		val >>= chunk;
		bits_left -= chunk;
		bit_pos += chunk;
	}
	*pCur_ofs = bit_pos;

	assert(*pCur_ofs <= 128);
}
1205 | 
// Complete solution for one block, as consumed by encode_bc7_block().
typedef struct
{
	// BC7 mode index (the block handlers in this file store 6 or 1 here).
	uint32_t m_mode;
	// Partition pattern index; only written to the block when the mode has partition bits.
	uint32_t m_partition;
	// One selector (interpolation weight index) per pixel, indexed by pixel position 0..15.
	uint8_t m_selectors[16];
	// Low endpoint of each subset; entry [1] is only meaningful for two-subset modes.
	color_quad_u8 m_low[2];
	// High endpoint of each subset; entry [1] is only meaningful for two-subset modes.
	color_quad_u8 m_high[2];
	// P-bits indexed as [subset][endpoint] (0 = low, 1 = high). Modes with a shared p-bit only use [subset][0].
	uint32_t m_pbits[2][2];
} bc7_optimization_results;
1215 | 
1216 | static void encode_bc7_block(void *pBlock, const bc7_optimization_results *pResults)
1217 | {
1218 | 	const uint32_t best_mode = pResults->m_mode;
1219 | 	const uint32_t total_subsets = g_bc7_num_subsets[best_mode];
1220 | 	const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode];
1221 | 	const uint8_t *pPartition = (total_subsets == 2) ? &g_bc7_partition2[pResults->m_partition * 16] : &g_bc7_partition1[0];
1222 | 
1223 | 	uint8_t color_selectors[16];
1224 | 	memcpy(color_selectors, pResults->m_selectors, 16);
1225 | 
1226 | 	color_quad_u8 low[2], high[2];
1227 | 	memcpy(low, pResults->m_low, sizeof(low));
1228 | 	memcpy(high, pResults->m_high, sizeof(high));
1229 | 
1230 | 	uint32_t pbits[2][2];
1231 | 	memcpy(pbits, pResults->m_pbits, sizeof(pbits));
1232 | 
1233 | 	int anchor[2] = { -1, -1 };
1234 | 
1235 | 	for (uint32_t k = 0; k < total_subsets; k++)
1236 | 	{
1237 | 		const uint32_t anchor_index = k ? g_bc7_table_anchor_index_second_subset[pResults->m_partition] : 0;
1238 | 		anchor[k] = anchor_index;
1239 | 
1240 | 		const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, 0);
1241 | 		const uint32_t num_color_indices = 1 << color_index_bits;
1242 | 
1243 | 		if (color_selectors[anchor_index] & (num_color_indices >> 1))
1244 | 		{
1245 | 			for (uint32_t i = 0; i < 16; i++)
1246 | 				if (pPartition[i] == k)
1247 | 					color_selectors[i] = (uint8_t)((num_color_indices - 1) - color_selectors[i]);
1248 | 
1249 | 			color_quad_u8 tmp = low[k];
1250 | 			low[k] = high[k];
1251 | 			high[k] = tmp;
1252 | 
1253 | 			if (!g_bc7_mode_has_shared_p_bits[best_mode])
1254 | 			{
1255 | 				uint32_t t = pbits[k][0];
1256 | 				pbits[k][0] = pbits[k][1];
1257 | 				pbits[k][1] = t;
1258 | 			}
1259 | 		}
1260 | 	}
1261 | 
1262 | 	uint8_t *pBlock_bytes = (uint8_t *)(pBlock);
1263 | 	memset(pBlock_bytes, 0, BC7ENC16_BLOCK_SIZE);
1264 | 
1265 | 	uint32_t cur_bit_ofs = 0;
1266 | 	set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs);
1267 | 
1268 | 	if (total_partitions > 1)
1269 | 		set_block_bits(pBlock_bytes, pResults->m_partition, 6, &cur_bit_ofs);
1270 | 
1271 | 	const uint32_t total_comps = (best_mode >= 4) ? 4 : 3;
1272 | 	for (uint32_t comp = 0; comp < total_comps; comp++)
1273 | 	{
1274 | 		for (uint32_t subset = 0; subset < total_subsets; subset++)
1275 | 		{
1276 | 			set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
1277 | 			set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
1278 | 		}
1279 | 	}
1280 | 
1281 | 	for (uint32_t subset = 0; subset < total_subsets; subset++)
1282 | 	{
1283 | 		set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs);
1284 | 		if (!g_bc7_mode_has_shared_p_bits[best_mode])
1285 | 			set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs);
1286 | 	}
1287 | 
1288 | 	for (int idx = 0; idx < 16; idx++)
1289 | 	{
1290 | 		uint32_t n = get_bc7_color_index_size(best_mode, 0);
1291 | 		if ((idx == anchor[0]) || (idx == anchor[1]))
1292 | 			n--;
1293 | 		set_block_bits(pBlock_bytes, color_selectors[idx], n, &cur_bit_ofs);
1294 | 	}
1295 | 
1296 | 	assert(cur_bit_ofs == 128);
1297 | }
1298 | 
1299 | static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams)
1300 | {
1301 | 	color_cell_compressor_results results6;
1302 | 
1303 | 	pParams->m_pSelector_weights = g_bc7_weights4;
1304 | 	pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x;
1305 | 	pParams->m_num_selector_weights = 16;
1306 | 	pParams->m_comp_bits = 7;
1307 | 	pParams->m_has_pbits = BC7ENC16_TRUE;
1308 | 	pParams->m_has_alpha = BC7ENC16_TRUE;
1309 | 	pParams->m_perceptual = pComp_params->m_perceptual;
1310 | 	pParams->m_num_pixels = 16;
1311 | 	pParams->m_pPixels = pPixels;
1312 | 
1313 | 	bc7_optimization_results opt_results;
1314 | 	results6.m_pSelectors = opt_results.m_selectors;
1315 | 
1316 | 	uint8_t selectors_temp[16];
1317 | 	results6.m_pSelectors_temp = selectors_temp;
1318 | 
1319 | 	color_cell_compression(6, pParams, &results6, pComp_params);
1320 | 
1321 | 	opt_results.m_mode = 6;
1322 | 	opt_results.m_partition = 0;
1323 | 	opt_results.m_low[0] = results6.m_low_endpoint;
1324 | 	opt_results.m_high[0] = results6.m_high_endpoint;
1325 | 	opt_results.m_pbits[0][0] = results6.m_pbits[0];
1326 | 	opt_results.m_pbits[0][1] = results6.m_pbits[1];
1327 | 
1328 | 	encode_bc7_block(pBlock, &opt_results);
1329 | }
1330 | 
1331 | static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams)
1332 | {
1333 | 	uint8_t selectors_temp[16];
1334 | 
1335 | 	// Mode 6
1336 | 	bc7_optimization_results opt_results;
1337 | 
1338 | 	pParams->m_pSelector_weights = g_bc7_weights4;
1339 | 	pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x;
1340 | 	pParams->m_num_selector_weights = 16;
1341 | 	pParams->m_comp_bits = 7;
1342 | 	pParams->m_has_pbits = BC7ENC16_TRUE;
1343 | 	pParams->m_endpoints_share_pbit = BC7ENC16_FALSE;
1344 | 	pParams->m_perceptual = pComp_params->m_perceptual;
1345 | 	pParams->m_num_pixels = 16;
1346 | 	pParams->m_pPixels = pPixels;
1347 | 	pParams->m_has_alpha = BC7ENC16_FALSE;
1348 | 
1349 | 	color_cell_compressor_results results6;
1350 | 	results6.m_pSelectors = opt_results.m_selectors;
1351 | 	results6.m_pSelectors_temp = selectors_temp;
1352 | 
1353 | 	uint64_t best_err = color_cell_compression(6, pParams, &results6, pComp_params);
1354 | 
1355 | 	opt_results.m_mode = 6;
1356 | 	opt_results.m_partition = 0;
1357 | 	opt_results.m_low[0] = results6.m_low_endpoint;
1358 | 	opt_results.m_high[0] = results6.m_high_endpoint;
1359 | 	opt_results.m_pbits[0][0] = results6.m_pbits[0];
1360 | 	opt_results.m_pbits[0][1] = results6.m_pbits[1];
1361 | 
1362 | 	// Mode 1
1363 | 	if ((best_err > 0) && (pComp_params->m_max_partitions_mode1 > 0))
1364 | 	{
1365 | 		const uint32_t trial_partition = estimate_partition(pPixels, pComp_params, pParams->m_weights);
1366 | 		pParams->m_pSelector_weights = g_bc7_weights3;
1367 | 		pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights3x;
1368 | 		pParams->m_num_selector_weights = 8;
1369 | 		pParams->m_comp_bits = 6;
1370 | 		pParams->m_has_pbits = BC7ENC16_TRUE;
1371 | 		pParams->m_endpoints_share_pbit = BC7ENC16_TRUE;
1372 | 
1373 | 		const uint8_t *pPartition = &g_bc7_partition2[trial_partition * 16];
1374 | 
1375 | 		color_quad_u8 subset_colors[2][16];
1376 | 
1377 | 		uint32_t subset_total_colors1[2] = { 0, 0 };
1378 | 
1379 | 		uint8_t subset_pixel_index1[2][16];
1380 | 		uint8_t subset_selectors1[2][16];
1381 | 		color_cell_compressor_results subset_results1[2];
1382 | 
1383 | 		for (uint32_t idx = 0; idx < 16; idx++)
1384 | 		{
1385 | 			const uint32_t p = pPartition[idx];
1386 | 			subset_colors[p][subset_total_colors1[p]] = pPixels[idx];
1387 | 			subset_pixel_index1[p][subset_total_colors1[p]] = (uint8_t)idx;
1388 | 			subset_total_colors1[p]++;
1389 | 		}
1390 | 
1391 | 		uint64_t trial_err = 0;
1392 | 		for (uint32_t subset = 0; subset < 2; subset++)
1393 | 		{
1394 | 			pParams->m_num_pixels = subset_total_colors1[subset];
1395 | 			pParams->m_pPixels = &subset_colors[subset][0];
1396 | 
1397 | 			color_cell_compressor_results *pResults = &subset_results1[subset];
1398 | 			pResults->m_pSelectors = &subset_selectors1[subset][0];
1399 | 			pResults->m_pSelectors_temp = selectors_temp;
1400 | 			uint64_t err = color_cell_compression(1, pParams, pResults, pComp_params);
1401 | 			trial_err += err;
1402 | 			if (trial_err > best_err)
1403 | 				break;
1404 | 
1405 | 		} // subset
1406 | 
1407 | 		if (trial_err < best_err)
1408 | 		{
1409 | 			best_err = trial_err;
1410 | 			opt_results.m_mode = 1;
1411 | 			opt_results.m_partition = trial_partition;
1412 | 			for (uint32_t subset = 0; subset < 2; subset++)
1413 | 			{
1414 | 				for (uint32_t i = 0; i < subset_total_colors1[subset]; i++)
1415 | 					opt_results.m_selectors[subset_pixel_index1[subset][i]] = subset_selectors1[subset][i];
1416 | 				opt_results.m_low[subset] = subset_results1[subset].m_low_endpoint;
1417 | 				opt_results.m_high[subset] = subset_results1[subset].m_high_endpoint;
1418 | 				opt_results.m_pbits[subset][0] = subset_results1[subset].m_pbits[0];
1419 | 			}
1420 | 		}
1421 | 	}
1422 | 
1423 | 	encode_bc7_block(pBlock, &opt_results);
1424 | }
1425 | 
1426 | bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params)
1427 | {
1428 | 	assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0);
1429 | 
1430 | 	const color_quad_u8 *pPixels = (const color_quad_u8 *)(pPixelsRGBA);
1431 | 
1432 | 	color_cell_compressor_params params;
1433 | 	if (pComp_params->m_perceptual)
1434 | 	{
1435 | 		// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
1436 | 		const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f));
1437 | 		const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f));
1438 | 		params.m_weights[0] = (int)(pComp_params->m_weights[0] * 4.0f);
1439 | 		params.m_weights[1] = (int)(pComp_params->m_weights[1] * 4.0f * pr_weight);
1440 | 		params.m_weights[2] = (int)(pComp_params->m_weights[2] * 4.0f * pb_weight);
1441 | 		params.m_weights[3] = pComp_params->m_weights[3] * 4;
1442 | 	}
1443 | 	else
1444 | 		memcpy(params.m_weights, pComp_params->m_weights, sizeof(params.m_weights));
1445 | 
1446 | 	for (uint32_t i = 0; i < 16; i++)
1447 | 	{
1448 | 		if (pPixels[i].m_c[3] < 255)
1449 | 		{
1450 | 			handle_alpha_block(pBlock, pPixels, pComp_params, &params);
1451 | 			return BC7ENC16_TRUE;
1452 | 		}
1453 | 	}
1454 | 	handle_opaque_block(pBlock, pPixels, pComp_params, &params);
1455 | 	return BC7ENC16_FALSE;
1456 | }
1457 | 
1458 | /*
1459 | ------------------------------------------------------------------------------
1460 | This software is available under 2 licenses -- choose whichever you prefer.
1461 | ------------------------------------------------------------------------------
1462 | ALTERNATIVE A - MIT License
1463 | Copyright(c) 2018 Richard Geldreich, Jr.
1464 | Permission is hereby granted, free of charge, to any person obtaining a copy of
1465 | this software and associated documentation files(the "Software"), to deal in
1466 | the Software without restriction, including without limitation the rights to
1467 | use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
1468 | of the Software, and to permit persons to whom the Software is furnished to do
1469 | so, subject to the following conditions :
1470 | The above copyright notice and this permission notice shall be included in all
1471 | copies or substantial portions of the Software.
1472 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1473 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1474 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1475 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1476 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1477 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1478 | SOFTWARE.
1479 | ------------------------------------------------------------------------------
1480 | ALTERNATIVE B - Public Domain(www.unlicense.org)
1481 | This is free and unencumbered software released into the public domain.
1482 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
1483 | software, either in source code form or as a compiled binary, for any purpose,
1484 | commercial or non - commercial, and by any means.
1485 | In jurisdictions that recognize copyright laws, the author or authors of this
1486 | software dedicate any and all copyright interest in the software to the public
1487 | domain.We make this dedication for the benefit of the public at large and to
1488 | the detriment of our heirs and successors.We intend this dedication to be an
1489 | overt act of relinquishment in perpetuity of all present and future rights to
1490 | this software under copyright law.
1491 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1492 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1493 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
1494 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
1495 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1496 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1497 | ------------------------------------------------------------------------------
1498 | */
1499 | 


--------------------------------------------------------------------------------
/bc7enc16.h:
--------------------------------------------------------------------------------
 1 | // File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c)
 2 | #include <stdlib.h>
 3 | #include <stdint.h>
 4 | 
 5 | #ifdef __cplusplus
 6 | extern "C" {
 7 | #endif
 8 | 
 9 | #define BC7ENC16_BLOCK_SIZE (16)
10 | #define BC7ENC16_MAX_PARTITIONS1 (64)
11 | #define BC7ENC16_MAX_UBER_LEVEL (4)
12 | 
13 | typedef uint8_t bc7enc16_bool;
14 | #define BC7ENC16_TRUE (1)
15 | #define BC7ENC16_FALSE (0)
16 | 
// User-facing compression settings passed to bc7enc16_compress_block().
// Use bc7enc16_compress_block_params_init() to fill in defaults before changing individual fields.
typedef struct
{
	// m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC16_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
	uint32_t m_max_partitions_mode1;
	
	// Relative RGBA or YCbCrA weights (index 0..3), depending on m_perceptual.
	uint32_t m_weights[4];
	
	// m_uber_level may range from 0 to BC7ENC16_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
	uint32_t m_uber_level;

	// If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
	bc7enc16_bool m_perceptual;

	// Set m_try_least_squares to false for slightly faster/lower quality compression.
	bc7enc16_bool m_try_least_squares;
	
	// When m_mode1_partition_estimation_filterbank is true, the mode 1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful.
	// There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to an 11% gain in speed depending on the other settings.
	bc7enc16_bool m_mode1_partition_estimation_filterbank;

} bc7enc16_compress_block_params;
39 | 
40 | inline void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p)
41 | {
42 | 	p->m_perceptual = BC7ENC16_FALSE;
43 | 	p->m_weights[0] = 1;
44 | 	p->m_weights[1] = 1;
45 | 	p->m_weights[2] = 1;
46 | 	p->m_weights[3] = 1;
47 | }
48 | 
49 | inline void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p)
50 | {
51 | 	p->m_perceptual = BC7ENC16_TRUE;
52 | 	p->m_weights[0] = 128;
53 | 	p->m_weights[1] = 64;
54 | 	p->m_weights[2] = 16;
55 | 	p->m_weights[3] = 32;
56 | }
57 | 
58 | inline void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p)
59 | {
60 | 	p->m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1;
61 | 	p->m_try_least_squares = BC7ENC16_TRUE;
62 | 	p->m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE;
63 | 	p->m_uber_level = 0;
64 | 	bc7enc16_compress_block_params_init_perceptual_weights(p);
65 | }
66 | 
67 | // bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts).
68 | void bc7enc16_compress_block_init();
69 | 
70 | // Packs a single block of 16x16 RGBA pixels (R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6.
71 | // Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6.
72 | // Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it return BC7ENC16_FALSE. (This is not an error code - a block is always encoded.)
73 | bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params);
74 | 
75 | #ifdef __cplusplus
76 | }
77 | #endif
78 | 


--------------------------------------------------------------------------------
/build_msvc.cmd:
--------------------------------------------------------------------------------
rem Generates 64-bit Visual Studio 2015 project files from the CMakeLists.txt in the current directory (in-source).
cmake -G "Visual Studio 14 2015 Win64" .
2 | 


--------------------------------------------------------------------------------
/dds_defs.h:
--------------------------------------------------------------------------------
  1 | // File: dds_defs.h
  2 | // DX9 .DDS file header definitions.
  3 | #pragma once
  4 | 
  5 | #define PIXEL_FMT_FOURCC(a, b, c, d) ((a) | ((b) << 8U) | ((c) << 16U) | ((d) << 24U))
  6 | 
// Pixel formats identified by a FOURCC code built with PIXEL_FMT_FOURCC (first character in the lowest byte).
// PIXEL_FMT_INVALID is 0, which no FOURCC listed here produces.
enum pixel_format
{
	PIXEL_FMT_INVALID = 0,

	PIXEL_FMT_DXT1 = PIXEL_FMT_FOURCC('D', 'X', 'T', '1'),
	PIXEL_FMT_DXT2 = PIXEL_FMT_FOURCC('D', 'X', 'T', '2'),
	PIXEL_FMT_DXT3 = PIXEL_FMT_FOURCC('D', 'X', 'T', '3'),
	PIXEL_FMT_DXT4 = PIXEL_FMT_FOURCC('D', 'X', 'T', '4'),
	PIXEL_FMT_DXT5 = PIXEL_FMT_FOURCC('D', 'X', 'T', '5'),
	PIXEL_FMT_3DC = PIXEL_FMT_FOURCC('A', 'T', 'I', '2'), // DXN_YX
	PIXEL_FMT_DXN = PIXEL_FMT_FOURCC('A', '2', 'X', 'Y'), // DXN_XY
	PIXEL_FMT_DXT5A = PIXEL_FMT_FOURCC('A', 'T', 'I', '1'), // ATI1N, http://developer.amd.com/media/gpu_assets/Radeon_X1x00_Programming_Guide.pdf

	// Non-standard formats (some of these are supported by ATI's Compressonator)
	PIXEL_FMT_DXT5_CCxY = PIXEL_FMT_FOURCC('C', 'C', 'x', 'Y'),
	PIXEL_FMT_DXT5_xGxR = PIXEL_FMT_FOURCC('x', 'G', 'x', 'R'),
	PIXEL_FMT_DXT5_xGBR = PIXEL_FMT_FOURCC('x', 'G', 'B', 'R'),
	PIXEL_FMT_DXT5_AGBR = PIXEL_FMT_FOURCC('A', 'G', 'B', 'R'),

	PIXEL_FMT_DXT1A = PIXEL_FMT_FOURCC('D', 'X', '1', 'A'),
	PIXEL_FMT_ETC1 = PIXEL_FMT_FOURCC('E', 'T', 'C', '1'),

	// Uncompressed layouts; 'x' marks a channel position that is not present in the name.
	PIXEL_FMT_R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'x'),
	PIXEL_FMT_L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'x'),
	PIXEL_FMT_A8 = PIXEL_FMT_FOURCC('x', 'x', 'x', 'A'),
	PIXEL_FMT_A8L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'A'),
	PIXEL_FMT_A8R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'A')
};
 35 | 
// Upper bound on image width/height. NOTE(review): presumably used as a sanity limit by DDS readers/writers - not enforced in this header.
const uint32_t cDDSMaxImageDimensions = 8192U;

// Size in bytes of DDSURFACEDESC2 as stored in a file.
// Total size of header is sizeof(uint32)+cDDSSizeofDDSurfaceDesc2;
const uint32_t cDDSSizeofDDSurfaceDesc2 = 124;

// "DDS " - the four characters 'D','D','S',' ' read as a little-endian 32-bit value.
const uint32_t cDDSFileSignature = 0x20534444;
 43 | 
// 8-byte placeholder; both fields are unused here and only reserve space inside DDSURFACEDESC2.
struct DDCOLORKEY
{
	uint32_t dwUnused0;
	uint32_t dwUnused1;
};
 49 | 
// Pixel format description embedded in DDSURFACEDESC2 (eight 32-bit fields, 32 bytes).
struct DDPIXELFORMAT
{
	uint32_t dwSize;
	// NOTE(review): presumably a combination of the DDPF_* constants - confirm against the code that fills this in.
	uint32_t dwFlags;
	uint32_t dwFourCC;
	uint32_t dwRGBBitCount;     // ATI compressonator will place a FOURCC code here for swizzled/cooked DXTn formats
	uint32_t dwRBitMask;
	uint32_t dwGBitMask;
	uint32_t dwBBitMask;
	uint32_t dwRGBAlphaBitMask;
};
 61 | 
// Surface capability words embedded in DDSURFACEDESC2 (four 32-bit fields, 16 bytes).
// NOTE(review): dwCaps/dwCaps2 presumably take the DDSCAPS_* / DDSCAPS2_* constants respectively - confirm with the writer code.
struct DDSCAPS2
{
	uint32_t dwCaps;
	uint32_t dwCaps2;
	uint32_t dwCaps3;
	uint32_t dwCaps4;
};
 69 | 
// DDS surface description. The fields add up to 124 bytes (with 4-byte fields and no padding),
// which is the value of cDDSSizeofDDSurfaceDesc2.
struct DDSURFACEDESC2
{
	uint32_t dwSize;
	// NOTE(review): presumably a combination of the DDSD_* constants - confirm with the writer code.
	uint32_t dwFlags;
	uint32_t dwHeight;
	uint32_t dwWidth;
	// The same 4 bytes hold either a pitch or a linear size.
	// NOTE(review): which one applies presumably follows DDSD_PITCH / DDSD_LINEARSIZE in dwFlags - confirm.
	union
	{
		int32_t lPitch;
		uint32_t dwLinearSize;
	};
	uint32_t dwBackBufferCount;
	uint32_t dwMipMapCount;
	uint32_t dwAlphaBitDepth;
	uint32_t dwUnused0;
	// Stored as a 32-bit value rather than a pointer, which keeps the struct size fixed on 64-bit builds.
	uint32_t lpSurface;
	DDCOLORKEY unused0;
	DDCOLORKEY unused1;
	DDCOLORKEY unused2;
	DDCOLORKEY unused3;
	DDPIXELFORMAT ddpfPixelFormat;
	DDSCAPS2 ddsCaps;
	uint32_t dwUnused1;
};
 94 | 
// DDSD_* bit flags. NOTE(review): presumably for DDSURFACEDESC2::dwFlags, marking which fields are valid - confirm with the writer code.
const uint32_t DDSD_CAPS = 0x00000001;
const uint32_t DDSD_HEIGHT = 0x00000002;
const uint32_t DDSD_WIDTH = 0x00000004;
const uint32_t DDSD_PITCH = 0x00000008;

const uint32_t DDSD_BACKBUFFERCOUNT = 0x00000020;
const uint32_t DDSD_ZBUFFERBITDEPTH = 0x00000040;
const uint32_t DDSD_ALPHABITDEPTH = 0x00000080;

const uint32_t DDSD_LPSURFACE = 0x00000800;

const uint32_t DDSD_PIXELFORMAT = 0x00001000;
const uint32_t DDSD_CKDESTOVERLAY = 0x00002000;
const uint32_t DDSD_CKDESTBLT = 0x00004000;
const uint32_t DDSD_CKSRCOVERLAY = 0x00008000;

const uint32_t DDSD_CKSRCBLT = 0x00010000;
const uint32_t DDSD_MIPMAPCOUNT = 0x00020000;
const uint32_t DDSD_REFRESHRATE = 0x00040000;
const uint32_t DDSD_LINEARSIZE = 0x00080000;

const uint32_t DDSD_TEXTURESTAGE = 0x00100000;
const uint32_t DDSD_FVF = 0x00200000;
const uint32_t DDSD_SRCVBHANDLE = 0x00400000;
const uint32_t DDSD_DEPTH = 0x00800000;

// Note: this mask is not the plain OR of the DDSD_* values above (it omits DDSD_CAPS and includes bit 0x100).
const uint32_t DDSD_ALL = 0x00fff9ee;

// DDPF_* bit flags. NOTE(review): presumably for DDPIXELFORMAT::dwFlags - confirm.
const uint32_t DDPF_ALPHAPIXELS = 0x00000001;
const uint32_t DDPF_ALPHA = 0x00000002;
const uint32_t DDPF_FOURCC = 0x00000004;
const uint32_t DDPF_PALETTEINDEXED8 = 0x00000020;
const uint32_t DDPF_RGB = 0x00000040;
const uint32_t DDPF_LUMINANCE = 0x00020000;

// DDSCAPS_* bit flags. NOTE(review): presumably for DDSCAPS2::dwCaps - confirm.
const uint32_t DDSCAPS_COMPLEX = 0x00000008;
const uint32_t DDSCAPS_TEXTURE = 0x00001000;
const uint32_t DDSCAPS_MIPMAP = 0x00400000;

// DDSCAPS2_* bit flags (cubemap faces, volume). NOTE(review): presumably for DDSCAPS2::dwCaps2 - confirm.
const uint32_t DDSCAPS2_CUBEMAP = 0x00000200;
const uint32_t DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800;

const uint32_t DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000;
const uint32_t DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000;

const uint32_t DDSCAPS2_VOLUME = 0x00200000;
144 | 
// Local definition of the DXGI_FORMAT codes, used as the dxgiFormat field of DDS_HEADER_DXT10.
// Values 0..115 are consecutive, followed by 130..132; DXGI_FORMAT_FORCE_UINT (0xffffffff) forces the
// enum to need a full 32-bit unsigned range.
// NOTE(review): the numeric values are presumably meant to match the Direct3D headers - verify before editing.
typedef enum DXGI_FORMAT 
{
	DXGI_FORMAT_UNKNOWN = 0,
	DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
	DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
	DXGI_FORMAT_R32G32B32A32_UINT = 3,
	DXGI_FORMAT_R32G32B32A32_SINT = 4,
	DXGI_FORMAT_R32G32B32_TYPELESS = 5,
	DXGI_FORMAT_R32G32B32_FLOAT = 6,
	DXGI_FORMAT_R32G32B32_UINT = 7,
	DXGI_FORMAT_R32G32B32_SINT = 8,
	DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
	DXGI_FORMAT_R16G16B16A16_FLOAT = 10,
	DXGI_FORMAT_R16G16B16A16_UNORM = 11,
	DXGI_FORMAT_R16G16B16A16_UINT = 12,
	DXGI_FORMAT_R16G16B16A16_SNORM = 13,
	DXGI_FORMAT_R16G16B16A16_SINT = 14,
	DXGI_FORMAT_R32G32_TYPELESS = 15,
	DXGI_FORMAT_R32G32_FLOAT = 16,
	DXGI_FORMAT_R32G32_UINT = 17,
	DXGI_FORMAT_R32G32_SINT = 18,
	DXGI_FORMAT_R32G8X24_TYPELESS = 19,
	DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
	DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
	DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
	DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
	DXGI_FORMAT_R10G10B10A2_UNORM = 24,
	DXGI_FORMAT_R10G10B10A2_UINT = 25,
	DXGI_FORMAT_R11G11B10_FLOAT = 26,
	DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
	DXGI_FORMAT_R8G8B8A8_UNORM = 28,
	DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
	DXGI_FORMAT_R8G8B8A8_UINT = 30,
	DXGI_FORMAT_R8G8B8A8_SNORM = 31,
	DXGI_FORMAT_R8G8B8A8_SINT = 32,
	DXGI_FORMAT_R16G16_TYPELESS = 33,
	DXGI_FORMAT_R16G16_FLOAT = 34,
	DXGI_FORMAT_R16G16_UNORM = 35,
	DXGI_FORMAT_R16G16_UINT = 36,
	DXGI_FORMAT_R16G16_SNORM = 37,
	DXGI_FORMAT_R16G16_SINT = 38,
	DXGI_FORMAT_R32_TYPELESS = 39,
	DXGI_FORMAT_D32_FLOAT = 40,
	DXGI_FORMAT_R32_FLOAT = 41,
	DXGI_FORMAT_R32_UINT = 42,
	DXGI_FORMAT_R32_SINT = 43,
	DXGI_FORMAT_R24G8_TYPELESS = 44,
	DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
	DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
	DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
	DXGI_FORMAT_R8G8_TYPELESS = 48,
	DXGI_FORMAT_R8G8_UNORM = 49,
	DXGI_FORMAT_R8G8_UINT = 50,
	DXGI_FORMAT_R8G8_SNORM = 51,
	DXGI_FORMAT_R8G8_SINT = 52,
	DXGI_FORMAT_R16_TYPELESS = 53,
	DXGI_FORMAT_R16_FLOAT = 54,
	DXGI_FORMAT_D16_UNORM = 55,
	DXGI_FORMAT_R16_UNORM = 56,
	DXGI_FORMAT_R16_UINT = 57,
	DXGI_FORMAT_R16_SNORM = 58,
	DXGI_FORMAT_R16_SINT = 59,
	DXGI_FORMAT_R8_TYPELESS = 60,
	DXGI_FORMAT_R8_UNORM = 61,
	DXGI_FORMAT_R8_UINT = 62,
	DXGI_FORMAT_R8_SNORM = 63,
	DXGI_FORMAT_R8_SINT = 64,
	DXGI_FORMAT_A8_UNORM = 65,
	DXGI_FORMAT_R1_UNORM = 66,
	DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
	DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
	DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
	DXGI_FORMAT_BC1_TYPELESS = 70,
	DXGI_FORMAT_BC1_UNORM = 71,
	DXGI_FORMAT_BC1_UNORM_SRGB = 72,
	DXGI_FORMAT_BC2_TYPELESS = 73,
	DXGI_FORMAT_BC2_UNORM = 74,
	DXGI_FORMAT_BC2_UNORM_SRGB = 75,
	DXGI_FORMAT_BC3_TYPELESS = 76,
	DXGI_FORMAT_BC3_UNORM = 77,
	DXGI_FORMAT_BC3_UNORM_SRGB = 78,
	DXGI_FORMAT_BC4_TYPELESS = 79,
	DXGI_FORMAT_BC4_UNORM = 80,
	DXGI_FORMAT_BC4_SNORM = 81,
	DXGI_FORMAT_BC5_TYPELESS = 82,
	DXGI_FORMAT_BC5_UNORM = 83,
	DXGI_FORMAT_BC5_SNORM = 84,
	DXGI_FORMAT_B5G6R5_UNORM = 85,
	DXGI_FORMAT_B5G5R5A1_UNORM = 86,
	DXGI_FORMAT_B8G8R8A8_UNORM = 87,
	DXGI_FORMAT_B8G8R8X8_UNORM = 88,
	DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
	DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
	DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
	DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
	DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
	DXGI_FORMAT_BC6H_TYPELESS = 94,
	DXGI_FORMAT_BC6H_UF16 = 95,
	DXGI_FORMAT_BC6H_SF16 = 96,
	DXGI_FORMAT_BC7_TYPELESS = 97,
	DXGI_FORMAT_BC7_UNORM = 98,
	DXGI_FORMAT_BC7_UNORM_SRGB = 99,
	DXGI_FORMAT_AYUV = 100,
	DXGI_FORMAT_Y410 = 101,
	DXGI_FORMAT_Y416 = 102,
	DXGI_FORMAT_NV12 = 103,
	DXGI_FORMAT_P010 = 104,
	DXGI_FORMAT_P016 = 105,
	DXGI_FORMAT_420_OPAQUE = 106,
	DXGI_FORMAT_YUY2 = 107,
	DXGI_FORMAT_Y210 = 108,
	DXGI_FORMAT_Y216 = 109,
	DXGI_FORMAT_NV11 = 110,
	DXGI_FORMAT_AI44 = 111,
	DXGI_FORMAT_IA44 = 112,
	DXGI_FORMAT_P8 = 113,
	DXGI_FORMAT_A8P8 = 114,
	DXGI_FORMAT_B4G4R4A4_UNORM = 115,
	DXGI_FORMAT_P208 = 130,
	DXGI_FORMAT_V208 = 131,
	DXGI_FORMAT_V408 = 132,
	DXGI_FORMAT_FORCE_UINT = 0xffffffff
} DXGI_FORMAT;
268 | 
// Resource kind stored in DDS_HEADER_DXT10::resourceDimension.
enum D3D10_RESOURCE_DIMENSION 
{
	D3D10_RESOURCE_DIMENSION_UNKNOWN = 0,
	D3D10_RESOURCE_DIMENSION_BUFFER = 1,
	D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2,
	D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3,
	D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4
};
277 | 
// Extended DDS header carrying a DXGI format code and resource dimension.
// NOTE(review): presumably written after DDSURFACEDESC2 when the pixel format FOURCC is "DX10" - confirm with the writer code.
struct DDS_HEADER_DXT10
{
	DXGI_FORMAT              dxgiFormat;
	D3D10_RESOURCE_DIMENSION resourceDimension;
	uint32_t                 miscFlag;
	uint32_t                 arraySize;
	uint32_t                 miscFlags2;
};
286 | 
287 | 


--------------------------------------------------------------------------------
/ktx_defs.h:
--------------------------------------------------------------------------------
 1 | // File: ktx_defs.h
 2 | // .KTX file header definitions.
 3 | // -> https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/
 4 | #pragma once
 5 | 
 6 | 
 7 | // OpenGL constants
 8 | #define GL_RGB 0x1907
 9 | #define GL_RGBA 0x1908
10 | #define GL_COMPRESSED_RGBA_BPTC_UNORM 0x8E8C
11 | #define GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM 0x8E8D
12 | 
13 | 
// File name extension for KTX files, including the leading dot.
constexpr char const * ktxFileNameExt = ".ktx";

// 12-byte file identifier: 0xAB, "KTX 11", 0xBB, "\r\n", 0x1A, "\n".
static constexpr uint8_t ktxFileIdentifier[12] =
{
   0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A
};

// Byte-order marker. NOTE(review): presumably the value written to KTX_HEADER::endianness - confirm with the writer code.
static constexpr uint32_t ktxEndianess = 0x04030201;

// One key/value entry: the NUL-terminated key "KTXorientation" followed by the NUL-terminated value "S=r,T=d".
static constexpr uint8_t ktxOrientation[] =
{
    'K','T','X','o','r','i','e','n','t','a','t','i','o','n',0,
    'S','=','r',',','T','=','d',0
};
29 | 
// KTX file header: a 12-byte identifier followed by thirteen 32-bit fields (64 bytes, no padding).
// The trailing comments give each field's byte offset from the start of the header.
struct KTX_HEADER
{
	uint8_t identifier[12];            // 0x00
	uint32_t endianness;               // 0x0c
	uint32_t glType;                   // 0x10
	uint32_t glTypeSize;               // 0x14
	uint32_t glFormat;                 // 0x18
	uint32_t glInternalFormat;         // 0x1c
	uint32_t glBaseInternalFormat;     // 0x20
	uint32_t pixelWidth;               // 0x24
	uint32_t pixelHeight;              // 0x28
	uint32_t pixelDepth;               // 0x2c
	uint32_t numberOfArrayElements;    // 0x30
	uint32_t numberOfFaces;            // 0x34
	uint32_t numberOfMipmapLevels;     // 0x38
	uint32_t bytesOfKeyValueData;      // 0x3c
};


--------------------------------------------------------------------------------