├── CMakeLists.txt ├── LICENSE ├── README ├── bc7decomp.c ├── bc7decomp.h ├── bc7enc.cpp ├── bc7enc16.c ├── bc7enc16.h ├── build_msvc.cmd ├── dds_defs.h ├── ktx_defs.h ├── lodepng.cpp └── lodepng.h /CMakeLists.txt: --------------------------------------------------------------------------------
# Build script for the bc7enc command-line executable.
#
# cmake_minimum_required() has to run before project() so that policy
# defaults are in place when the languages are enabled. The "...3.29" upper
# bound lets recent CMake releases use their newer policies (and keeps CMake 4
# from rejecting the file) while older releases simply ignore it.
# 2.8.12 is the first release with the keyword form of target_link_libraries().
cmake_minimum_required(VERSION 2.8.12...3.29)
project(bc7enc)

# BUILD_X64=ON leaves the compiler's default target untouched; OFF adds -m32
# on non-MSVC compilers. The option has no effect with MSVC.
option(BUILD_X64 "build 64-bit" TRUE)

message(STATUS "Initial BUILD_X64=${BUILD_X64}")
message(STATUS "Initial CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")

# Default to Release, but only for single-configuration generators.
# Multi-configuration generators pick the configuration at build time and
# ignore CMAKE_BUILD_TYPE.
if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

message(STATUS "${PROJECT_NAME} build type: ${CMAKE_BUILD_TYPE}")

if(BUILD_X64)
  message(STATUS "Building 64-bit")
else()
  message(STATUS "Building 32-bit")
endif()

# -fno-strict-aliasing shouldn't be necessary, it's here because that is what
# MSVC uses by default and that's what I've tested with the most.
if(NOT MSVC)
  set(GCC_COMPILE_FLAGS "-fno-strict-aliasing -Wall -Wextra")
  if(NOT BUILD_X64)
    set(GCC_COMPILE_FLAGS "${GCC_COMPILE_FLAGS} -m32")
  endif()
endif()

# The CMAKE_<LANG>_FLAGS variables are used deliberately instead of
# target_compile_options(): they are also handed to the compiler driver when
# linking, which -m32 needs. The flags are added once to the base variables;
# the per-configuration variables are appended to those on the command line,
# so repeating GCC_COMPILE_FLAGS there only duplicated every flag.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GCC_COMPILE_FLAGS}")

# Debug builds: add -g on GCC-style compilers and define _DEBUG everywhere.
# Each language extends its own variable (the C variable used to be seeded
# from the C++ one).
if(NOT MSVC)
  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g")
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
endif()
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -D_DEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG")

# COMMON_SRC_LIST is not set anywhere in this file; it expands to nothing
# unless a value is supplied from outside (e.g. -DCOMMON_SRC_LIST=...).
set(BC7ENC_SRC_LIST
  ${COMMON_SRC_LIST}
  bc7enc.cpp
  lodepng.cpp
  bc7decomp.c
  bc7enc16.c
)

add_executable(bc7enc ${BC7ENC_SRC_LIST})

if(NOT MSVC)
  # Link the C math library (libm).
  target_link_libraries(bc7enc PRIVATE m)
endif()

-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | bc7enc16.c/.h is available under 2 licenses -- choose whichever you prefer: 2 | 3 | ALTERNATIVE A for bc7enc.c/.h - MIT License 4 | Copyright(c) 2018 Richard Geldreich, Jr. 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files(the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions : 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software.
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | ------------------------------------------------------------------------------ 21 | ALTERNATIVE B for bc7enc.c/.h - Public Domain(www.unlicense.org) 22 | This is free and unencumbered software released into the public domain. 23 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 24 | software, either in source code form or as a compiled binary, for any purpose, 25 | commercial or non - commercial, and by any means. 26 | In jurisdictions that recognize copyright laws, the author or authors of this 27 | software dedicate any and all copyright interest in the software to the public 28 | domain.We make this dedication for the benefit of the public at large and to 29 | the detriment of our heirs and successors.We intend this dedication to be an 30 | overt act of relinquishment in perpetuity of all present and future rights to 31 | this software under copyright law. 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 35 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 36 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 37 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
38 | ------------------------------------------------------------------------------ 39 | 40 | bc7decomp.c/.h: Copyright (c) 2015 Harm Hanemaaijer 41 | Permission to use, copy, modify, and/or distribute this software for any 42 | purpose with or without fee is hereby granted, provided that the above 43 | copyright notice and this permission notice appear in all copies. 44 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 45 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 46 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 47 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 48 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 49 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 50 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 51 | 52 | ------------------------------------------------------------------------------ 53 | 54 | LodePNG version 20161127 55 | 56 | Copyright (c) 2005-2016 Lode Vandevenne 57 | 58 | This software is provided 'as-is', without any express or implied 59 | warranty. In no event will the authors be held liable for any damages 60 | arising from the use of this software. 61 | 62 | Permission is granted to anyone to use this software for any purpose, 63 | including commercial applications, and to alter it and redistribute it 64 | freely, subject to the following restrictions: 65 | 66 | 1. The origin of this software must not be misrepresented; you must not 67 | claim that you wrote the original software. If you use this software 68 | in a product, an acknowledgment in the product documentation would be 69 | appreciated but is not required. 70 | 71 | 2. Altered source versions must be plainly marked as such, and must not be 72 | misrepresented as being the original software. 73 | 74 | 3. This notice may not be removed or altered from any source 75 | distribution. 
76 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | bc7enc16 - Fast, single source file BC7/BPTC GPU texture encoder with perceptual colorspace metric support 2 | 3 | Note: Since this repo was created, we've released two new codecs with better BC7 encoders: 4 | https://github.com/richgel999/bc7enc_rdo 5 | https://github.com/BinomialLLC/bc7e 6 | 7 | bc7enc16 purposely only supports modes 1 and 6. This is a strong opaque texture encoder, with basic 8 | support for alpha channels (using mode 6). The intended use case is opaque textures, or opaque textures 9 | with relatively simple alpha channels. It also acts as a relatively simple to understand example. 10 | 11 | If alpha is highly correlated compared to RGB, or alpha is relatively simple 12 | (think simple masks where lots of blocks are either all-transparent or 13 | all-opaque), it should work great. For complex alpha channels more modes (such 14 | as 4, 5 or maybe 7) are necessary. 15 | 16 | This codec supports a perceptual mode, where it computes colorspace error in 17 | weighted YCbCr space (like etc2comp), and it also supports weighted RGBA 18 | metrics. It's particularly strong in perceptual mode, beating the current state of 19 | the art CPU encoder (Intel's ispc_texcomp) by a wide margin when measured by 20 | Luma PSNR, even though it only supports 2 modes and isn't vectorized. 21 | 22 | Why only modes 1 and 6? 23 | Because with these two modes you have a complete encoder that supports both 24 | opaque and transparent textures in a small amount (~1400 lines) of 25 | understandable plain C code. Mode 6 excels on smooth blocks, and mode 1 is 26 | strong with complex blocks, and a strong encoder that combines both modes can be 27 | quite high quality. Fast mode 6-only encoders will have noticeable block 28 | artifacts which this codec avoids by fully supporting mode 1.
29 | 30 | Modes 1 and 6 are typically the most used modes on many textures using other 31 | encoders. Mode 1 has two subsets, 64 possible partitions, and 3-bit indices, 32 | while mode 6 has large 4-bit indices and high precision 7777.1 endpoints. This 33 | codec produces output that is far higher quality than any BC1 encoder, and 34 | approaches (or in perceptual mode exceeds!) the quality of other full BC7 35 | encoders. 36 | 37 | Why is bc7enc16 so fast in perceptual mode? 38 | Computing error in YCbCr space is more expensive than in RGB space, yet bc7enc16 39 | in perceptual mode is stronger than ispc_texcomp (see the benchmark below) - 40 | even without SSE/AVX vectorization and with only 2 modes to work with! 41 | 42 | Most BC7 encoders only support linear RGB colorspace metrics, which is a 43 | fundamental weakness. Some support weighted RGB metrics, which is better. With 44 | linear RGB metrics, encoding error is roughly balanced between each channel, and 45 | encoders have to work *very* hard (examining large amounts of RGB search space) 46 | to get overall quality up. With perceptual colorspace metrics, RGB error tends 47 | to become a bit unbalanced, with green quality favored more highly than red and 48 | blue, and blue quality favored the least. A perceptual encoder is tuned to 49 | prefer exploring solutions along the luma axis, where it's much less work to find 50 | solutions with less luma error. bc7enc16 is, as far as I know, the first BC7 51 | codec to support computing error in weighted YCbCr colorspace. 52 | 53 | Note: Most of the timings here (except for the ispc_texcomp "fast" mode timings at the very bottom) 54 | are for the *original* release, before I added several more optimizations. The latest version of 55 | bc7enc16.c is around 8-27% faster than the initial release at same quality (when mode 1 is enabled - 56 | there's no change with just mode 6). 
57 | 58 | Some benchmarks across 31 images (kodim corpus+others): 59 | 60 | Perceptual (average REC709 Luma PSNR - higher is better quality): 61 | 62 | ispc_texcomp slow vs. bc7enc16 uber4/max_partitions 64 63 | ispc_texcomp: 355.4 secs 48.6 dB 64 | bc7enc16: 122.6 secs 50.0 dB 65 | 66 | ispc_texcomp slow vs. bc7enc16 uber0/max_partitions 64 67 | ispc_texcomp: 355.4 secs 48.6 dB 68 | bc7enc16: 38.3 secs 49.6 dB 69 | 70 | ispc_texcomp basic vs. bc7enc16 uber0/max_partitions 16 71 | ispc_texcomp: 100.2 secs 48.3 dB 72 | bc7enc16: 20.8 secs 49.3 dB 73 | 74 | ispc_texcomp fast vs. bc7enc16 uber0/max_partitions 16 75 | ispc_texcomp: 41.5 secs 48.0 dB 76 | bc7enc16: 20.8 secs 49.3 dB 77 | 78 | ispc_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0 79 | ispc_texcomp: 1.9 secs 46.2 dB 80 | bc7enc16: 8.9 secs 48.4 dB 81 | 82 | Non-perceptual (average RGB PSNR): 83 | 84 | ispc_texcomp slow vs. bc7enc16 uber4/max_partitions 64 85 | ispc_texcomp: 355.4 secs 46.8 dB 86 | bc7enc16: 51 secs 46.1 dB 87 | 88 | ispc_texcomp slow vs. bc7enc16 uber0/max_partitions 64 89 | ispc_texcomp: 355.4 secs 46.8 dB 90 | bc7enc16: 29.3 secs 45.8 dB 91 | 92 | ispc_texcomp basic vs. bc7enc16 uber4/max_partitions 64 93 | ispc_texcomp: 99.9 secs 46.5 dB 94 | bc7enc16: 51 secs 46.1 dB 95 | 96 | ispc_texcomp fast vs. bc7enc16 uber1/max_partitions 16 97 | ispc_texcomp: 41.5 secs 46.1 dB 98 | bc7enc16: 19.8 secs 45.5 dB 99 | 100 | ispc_texcomp fast vs. bc7enc16 uber0/max_partitions 8 101 | ispc_texcomp: 41.5 secs 46.1 dB 102 | bc7enc16: 10.46 secs 44.4 dB 103 | 104 | ispc_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0 105 | ispc_texcomp: 1.9 secs 42.7 dB 106 | bc7enc16: 3.8 secs 42.7 dB 107 | 108 | DirectXTex CPU in "mode 6 only" mode vs. bc7enc16 uber1/max_partitions 0 (mode 6 only), non-perceptual: 109 | 110 | DirectXTex: 466.4 secs 41.9 dB 111 | bc7enc16: 6.7 secs 42.8 dB 112 | 113 | DirectXTex CPU in (default - no 3 subset modes) vs.
bc7enc16 uber1/max_partitions 64, non-perceptual: 114 | 115 | DirectXTex: 9485.1 secs 45.6 dB 116 | bc7enc16: 36 secs 46.0 dB 117 | 118 | (Note this version of DirectXTex has a key pbit bugfix which I've submitted but 119 | is still waiting to be accepted. Non-bugfixed versions will be slightly lower 120 | quality.) 121 | 122 | UPDATE: To illustrate how strong the mode 1+6 implementation is in bc7enc16, let's compare ispc_texcomp 123 | fast vs. the latest version of bc7enc16 uber4/max_partitions 64: 124 | 125 | Without filterbank optimizations: 126 | 127 | Time RGB PSNR Y PSNR 128 | ispc_texcomp: 41.45 secs 46.09 dB 48.0 dB 129 | bc7enc16: 41.42 secs 46.03 dB 48.2 dB 130 | 131 | With filterbank optimizations enabled: 132 | bc7enc16: 38.78 secs 45.94 dB 48.12 dB 133 | 134 | They both have virtually the same average RGB PSNR with these settings (.06 dB is basically noise), but 135 | bc7enc16 is just as fast as ispc_texcomp fast, even though it's not vectorized. Interestingly, our Y PSNR is better, 136 | although bc7enc16 wasn't using perceptual metrics in these benchmarks. 137 | 138 | This was a multithreaded benchmark (using OpenMP) on a dual Xeon workstation. 139 | ispc_texcomp was called with 64-blocks at a time and used AVX instructions. 140 | Timings are for encoding only. 141 | -------------------------------------------------------------------------------- /bc7decomp.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015 Harm Hanemaaijer 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 7 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 8 | MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 9 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 10 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 11 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 12 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 13 | */ 14 | 15 | // Modified by Rich Geldreich 4/26/18- fixed bugs in detexBlock128ExtractBits() and FullyDecodeEndpoints(), 16 | // compared vs. DirectXTex'c BC7 decoder for correctness. 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "bc7decomp.h" 23 | 24 | // Integer division using look-up tables, used by BC1/2/3 and RGTC (BC4/5) 25 | // decompression. 26 | 27 | typedef struct { 28 | uint64_t data0; 29 | uint64_t data1; 30 | int index; 31 | } detexBlock128; 32 | 33 | uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits) { 34 | uint32_t value = 0; 35 | for (int i = 0; i < nu_bits; i++) { 36 | if (block->index < 64) { 37 | int shift = block->index - i; 38 | if (shift < 0) 39 | value |= (block->data0 & ((uint64_t)1 << block->index)) << (-shift); 40 | else 41 | value |= (block->data0 & ((uint64_t)1 << block->index)) >> shift; 42 | } 43 | else { 44 | int shift = ((block->index - 64) - i); 45 | if (shift < 0) 46 | value |= (block->data1 & ((uint64_t)1 << (block->index - 64))) << (-shift); 47 | else 48 | value |= (block->data1 & ((uint64_t)1 << (block->index - 64))) >> shift; 49 | } 50 | block->index++; 51 | } 52 | // if (block->index > 128) 53 | // printf("Block overflow (%d)\n", block->index); 54 | return value; 55 | } 56 | 57 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetR8(uint32_t pixel) { 58 | return pixel & 0xFF; 59 | } 60 | 61 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetG8(uint32_t pixel) { 62 | return (pixel & 0xFF00) >> 8; 63 | } 64 | 65 | static DETEX_INLINE_ONLY uint32_t detexPixel32GetB8(uint32_t pixel) { 66 | return (pixel & 0xFF0000) >> 16; 67 | } 68 | 69 | static 
DETEX_INLINE_ONLY uint32_t detexPixel32GetA8(uint32_t pixel) { 70 | return (pixel & 0xFF000000) >> 24; 71 | } 72 | 73 | static DETEX_INLINE_ONLY uint32_t detexPack32R8(int r) { 74 | return (uint32_t)r; 75 | } 76 | 77 | static DETEX_INLINE_ONLY uint32_t detexPack32G8(int g) { 78 | return (uint32_t)g << 8; 79 | } 80 | 81 | static DETEX_INLINE_ONLY uint32_t detexPack32B8(int b) { 82 | return (uint32_t)b << 16; 83 | } 84 | 85 | static DETEX_INLINE_ONLY uint32_t detexPack32A8(int a) { 86 | return (uint32_t)a << 24; 87 | } 88 | 89 | static DETEX_INLINE_ONLY uint32_t detexPack32RGBA8(int r, int g, int b, int a) { 90 | return (uint32_t)r | ((uint32_t)g << 8) | ((uint32_t)b << 16) | 91 | ((uint32_t)a << 24); 92 | } 93 | 94 | uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits); 95 | 96 | /* Return bitfield from bit0 to bit1 from 64-bit bitstring. */ 97 | static DETEX_INLINE_ONLY uint32_t detexGetBits64(uint64_t data, int bit0, int bit1) { 98 | uint64_t mask; 99 | if (bit1 == 63) 100 | mask = UINT64_MAX; 101 | else 102 | mask = ((uint64_t)1 << (bit1 + 1)) - 1; 103 | 104 | return (uint32_t)((data & mask) >> bit0); 105 | } 106 | 107 | const uint8_t detex_bptc_table_P2[64 * 16] = { 108 | 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 109 | 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 110 | 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 111 | 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 112 | 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 113 | 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 114 | 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 115 | 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, 116 | 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 117 | 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 118 | 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 119 | 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 120 | 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 121 | 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 122 | 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 123 | 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, 124 | 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 125 | 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 126 | 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 127 | 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 128 | 
0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 129 | 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 130 | 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 131 | 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, 132 | 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 133 | 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 134 | 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 135 | 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 136 | 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 137 | 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 138 | 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 139 | 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, 140 | 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 141 | 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 142 | 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 143 | 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 144 | 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 145 | 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 146 | 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 147 | 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, 148 | 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 149 | 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 150 | 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 151 | 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 152 | 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 153 | 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 154 | 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 155 | 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, 156 | 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 157 | 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 158 | 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 159 | 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 160 | 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 161 | 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 162 | 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 163 | 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, 164 | 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 165 | 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 166 | 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 167 | 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 168 | 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 169 | 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 170 | 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 171 | 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 172 | }; 173 | 174 | const uint8_t detex_bptc_table_P3[64 * 16] = { 175 | 0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 176 | 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 177 | 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 178 | 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 179 | 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 180 | 
0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 181 | 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 182 | 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1, 183 | 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 184 | 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 185 | 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 186 | 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 187 | 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 188 | 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 189 | 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 190 | 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0, 191 | 0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2, 192 | 0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0, 193 | 0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2, 194 | 0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1, 195 | 0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2, 196 | 0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1, 197 | 0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2, 198 | 0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0, 199 | 0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0, 200 | 0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2, 201 | 0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0, 202 | 0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1, 203 | 0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2, 204 | 0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2, 205 | 0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1, 206 | 0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1, 207 | 0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2, 208 | 0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1, 209 | 0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2, 210 | 0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0, 211 | 0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0, 212 | 0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0, 213 | 0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0, 214 | 0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1, 215 | 0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1, 216 | 0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2, 217 | 0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1, 218 | 0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2, 219 | 0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1, 220 | 0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1, 221 | 0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1, 222 | 0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1, 223 | 0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2, 224 | 0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1, 225 | 0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2, 226 | 0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2, 227 | 0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2, 228 | 0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2, 229 | 0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2, 230 | 0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2, 231 | 
0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 232 | 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 233 | 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 234 | 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 235 | 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 236 | 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 237 | 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 238 | 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0, 239 | }; 240 | 241 | const uint8_t detex_bptc_table_anchor_index_second_subset[64] = { 242 | 15,15,15,15,15,15,15,15, 243 | 15,15,15,15,15,15,15,15, 244 | 15, 2, 8, 2, 2, 8, 8,15, 245 | 2, 8, 2, 2, 8, 8, 2, 2, 246 | 15,15, 6, 8, 2, 8,15,15, 247 | 2, 8, 2, 2, 2,15,15, 6, 248 | 6, 2, 6, 8,15,15, 2, 2, 249 | 15,15,15,15,15, 2, 2,15 250 | }; 251 | 252 | const uint8_t detex_bptc_table_anchor_index_second_subset_of_three[64] = { 253 | 3, 3,15,15, 8, 3,15,15, 254 | 8, 8, 6, 6, 6, 5, 3, 3, 255 | 3, 3, 8,15, 3, 3, 6,10, 256 | 5, 8, 8, 6, 8, 5,15,15, 257 | 8,15, 3, 5, 6,10, 8,15, 258 | 15, 3,15, 5,15,15,15,15, 259 | 3,15, 5, 5, 5, 8, 5,10, 260 | 5,10, 8,13,15,12, 3, 3 261 | }; 262 | 263 | const uint8_t detex_bptc_table_anchor_index_third_subset[64] = { 264 | 15, 8, 8, 3,15,15, 3, 8, 265 | 15,15,15,15,15,15,15, 8, 266 | 15, 8,15, 3,15, 8,15, 8, 267 | 3,15, 6,10,15,15,10, 8, 268 | 15, 3,15,10,10, 8, 9,10, 269 | 6,15, 8,15, 3, 6, 6, 8, 270 | 15, 3,15,15,15,15,15,15, 271 | 15,15,15,15, 3,15,15, 8 272 | }; 273 | 274 | const uint16_t detex_bptc_table_aWeight2[4] = { 275 | 0, 21, 43, 64 276 | }; 277 | 278 | const uint16_t detex_bptc_table_aWeight3[8] = { 279 | 0, 9, 18, 27, 37, 46, 55, 64 280 | }; 281 | 282 | const uint16_t detex_bptc_table_aWeight4[16] = { 283 | 0, 4, 9, 13, 17, 21, 26, 30, 284 | 34, 38, 43, 47, 51, 55, 60, 64 285 | }; 286 | 287 | 288 | 289 | // BPTC mode layout: 290 | // 291 | // Number of subsets = { 3, 2, 3, 2, 1, 1, 1, 2 }; 292 | // Partition bits = { 4, 6, 6, 6, 0, 0, 0, 6 }; 293 | // Rotation bits = { 0, 0, 0, 0, 2, 2, 0, 0 }; 294 | // Mode 4 has one index selection bit. 
295 | // 296 | // #subsets color alpha before color index after color index after After Index 297 | // alpha pbits bits (*) 298 | // Mode 0 3 4 0 1 + 4 = 5 5 + 6 * 3 * 4 = 77 77 + 6 = 83 + 48 - 3 = 128 299 | // Mode 1 2 6 0 2 + 6 = 8 8 + 4 * 3 * 6 = 80 80 + 2 = 82 + 48 - 2 = 128 300 | // Mode 2 3 5 0 3 + 6 = 9 9 + 6 * 3 * 5 = 99 99 99 + 32 - 3 = 128 301 | // Mode 3 2 7 0 4 + 6 = 10 10 + 4 * 3 * 7 = 94 94 + 4 = 98 + 32 - 2 = 128 302 | // Mode 4 1 5 6 5 + 2 + 1 = 8 8 + 2 * 3 * 5 = 38 37 + 2 * 6 = 50 50 + 80 - 2 = 128 303 | // Mode 5 1 7 8 6 + 2 = 8 8 + 2 * 3 * 7 = 50 50 + 2 * 8 = 66 66 + 64 - 2 = 128 304 | // Mode 6 1 7 7 7 7 + 2 * 3 * 7 = 49 49 + 2 * 7 = 63 + 2 = 65 + 64 - 1 = 128 305 | // Mode 7 2 5 5 8 + 6 = 14 14 + 4 * 3 * 5 = 74 74 + 4 * 5 = 94 + 4 = 98 + 32 - 2 = 128 306 | // 307 | // (*) For formats without alpha, the number of index bits is reduced by #subsets anchor bits. 308 | // For formats with alpha, the number of index bits is reduced by 2 * #subsets by the anchor bits. 309 | 310 | 311 | static const uint8_t color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; 312 | 313 | // Note: precision includes P-bits! 314 | static const uint8_t color_precision_plus_pbit_table[8] = { 5, 7, 5, 8, 5, 7, 8, 6 }; 315 | 316 | static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecision(int mode) { 317 | return color_precision_table[mode]; 318 | } 319 | 320 | static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecisionPlusPbit(int mode) { 321 | return color_precision_plus_pbit_table[mode]; 322 | } 323 | 324 | static const int8_t alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; 325 | 326 | // Note: precision include P-bits! 
327 | static const uint8_t alpha_precision_plus_pbit_table[8] = { 0, 0, 0, 0, 6, 8, 8, 6 }; 328 | 329 | static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecision(int mode) { 330 | return alpha_precision_table[mode]; 331 | } 332 | 333 | static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecisionPlusPbit(int mode) { 334 | return alpha_precision_plus_pbit_table[mode]; 335 | } 336 | 337 | static const int8_t components_in_qword0_table[8] = { 2, -1, 1, 1, 3, 3, 3, 2 }; 338 | 339 | /* Extract endpoint colors. */ 340 | static void ExtractEndpoints(int mode, int nu_subsets, detexBlock128 * DETEX_RESTRICT block, 341 | uint8_t * DETEX_RESTRICT endpoint_array) { 342 | // Optimized version avoiding the use of block_extract_bits(). 343 | int components_in_qword0 = components_in_qword0_table[mode]; 344 | uint64_t data = block->data0 >> block->index; 345 | uint8_t precision = GetColorComponentPrecision(mode); 346 | uint8_t mask = (1 << precision) - 1; 347 | int total_bits_per_component = nu_subsets * 2 * precision; 348 | for (int i = 0; i < components_in_qword0; i++) // For each color component. 349 | for (int j = 0; j < nu_subsets; j++) // For each subset. 350 | for (int k = 0; k < 2; k++) { // For each endpoint. 351 | endpoint_array[j * 8 + k * 4 + i] = data & mask; 352 | data >>= precision; 353 | } 354 | block->index += components_in_qword0 * total_bits_per_component; 355 | if (components_in_qword0 < 3) { 356 | // Handle the color component that crosses the boundary between data0 and data1 357 | data = block->data0 >> block->index; 358 | data |= block->data1 << (64 - block->index); 359 | int i = components_in_qword0; 360 | for (int j = 0; j < nu_subsets; j++) // For each subset. 361 | for (int k = 0; k < 2; k++) { // For each endpoint. 362 | endpoint_array[j * 8 + k * 4 + i] = data & mask; 363 | data >>= precision; 364 | } 365 | block->index += total_bits_per_component; 366 | } 367 | if (components_in_qword0 < 2) { 368 | // Handle the color component that is wholly in data1. 
369 | data = block->data1 >> (block->index - 64); 370 | int i = 2; 371 | for (int j = 0; j < nu_subsets; j++) // For each subset. 372 | for (int k = 0; k < 2; k++) { // For each endpoint. 373 | endpoint_array[j * 8 + k * 4 + i] = data & mask; 374 | data >>= precision; 375 | } 376 | block->index += total_bits_per_component; 377 | } 378 | // Alpha component. 379 | if (GetAlphaComponentPrecision(mode) > 0) { 380 | // For mode 7, the alpha data is wholly in data1. 381 | // For modes 4 and 6, the alpha data is wholly in data0. 382 | // For mode 5, the alpha data is in data0 and data1. 383 | if (mode == 7) 384 | data = block->data1 >> (block->index - 64); 385 | else if (mode == 5) 386 | data = (block->data0 >> block->index) | ((block->data1 & 0x3) << 14); 387 | else 388 | data = block->data0 >> block->index; 389 | uint8_t alpha_precision = GetAlphaComponentPrecision(mode); 390 | uint8_t mask = (1 << alpha_precision) - 1; 391 | for (int j = 0; j < nu_subsets; j++) 392 | for (int k = 0; k < 2; k++) { // For each endpoint. 393 | endpoint_array[j * 8 + k * 4 + 3] = data & mask; 394 | data >>= alpha_precision; 395 | } 396 | block->index += nu_subsets * 2 * alpha_precision; 397 | } 398 | } 399 | 400 | static const uint8_t mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; 401 | 402 | static void FullyDecodeEndpoints(uint8_t * DETEX_RESTRICT endpoint_array, int nu_subsets, 403 | int mode, detexBlock128 * DETEX_RESTRICT block) { 404 | if (mode_has_p_bits[mode]) { 405 | // Mode 1 (shared P-bits) handled elsewhere. 406 | // Extract end-point P-bits. 
407 | uint32_t bits; 408 | if (block->index < 64) 409 | { 410 | bits = (uint32_t)(block->data0 >> block->index); 411 | if ((block->index + nu_subsets * 2) > 64) 412 | { 413 | bits |= (block->data1 << (64 - block->index)); 414 | } 415 | } 416 | else 417 | bits = (uint32_t)(block->data1 >> (block->index - 64)); 418 | for (int i = 0; i < nu_subsets * 2; i++) { 419 | endpoint_array[i * 4 + 0] <<= 1; 420 | endpoint_array[i * 4 + 1] <<= 1; 421 | endpoint_array[i * 4 + 2] <<= 1; 422 | endpoint_array[i * 4 + 3] <<= 1; 423 | endpoint_array[i * 4 + 0] |= (bits & 1); 424 | endpoint_array[i * 4 + 1] |= (bits & 1); 425 | endpoint_array[i * 4 + 2] |= (bits & 1); 426 | endpoint_array[i * 4 + 3] |= (bits & 1); 427 | bits >>= 1; 428 | } 429 | block->index += nu_subsets * 2; 430 | } 431 | int color_prec = GetColorComponentPrecisionPlusPbit(mode); 432 | int alpha_prec = GetAlphaComponentPrecisionPlusPbit(mode); 433 | for (int i = 0; i < nu_subsets * 2; i++) { 434 | // Color_component_precision & alpha_component_precision includes pbit 435 | // left shift endpoint components so that their MSB lies in bit 7 436 | endpoint_array[i * 4 + 0] <<= (8 - color_prec); 437 | endpoint_array[i * 4 + 1] <<= (8 - color_prec); 438 | endpoint_array[i * 4 + 2] <<= (8 - color_prec); 439 | endpoint_array[i * 4 + 3] <<= (8 - alpha_prec); 440 | 441 | // Replicate each component's MSB into the LSBs revealed by the left-shift operation above. 
442 | endpoint_array[i * 4 + 0] |= (endpoint_array[i * 4 + 0] >> color_prec); 443 | endpoint_array[i * 4 + 1] |= (endpoint_array[i * 4 + 1] >> color_prec); 444 | endpoint_array[i * 4 + 2] |= (endpoint_array[i * 4 + 2] >> color_prec); 445 | endpoint_array[i * 4 + 3] |= (endpoint_array[i * 4 + 3] >> alpha_prec); 446 | } 447 | if (mode <= 3) { 448 | for (int i = 0; i < nu_subsets * 2; i++) 449 | endpoint_array[i * 4 + 3] = 0xFF; 450 | } 451 | } 452 | 453 | static uint8_t Interpolate(uint8_t e0, uint8_t e1, uint8_t index, uint8_t indexprecision) { 454 | if (indexprecision == 2) 455 | return (uint8_t)(((64 - detex_bptc_table_aWeight2[index]) * (uint16_t)e0 456 | + detex_bptc_table_aWeight2[index] * (uint16_t)e1 + 32) >> 6); 457 | else 458 | if (indexprecision == 3) 459 | return (uint8_t)(((64 - detex_bptc_table_aWeight3[index]) * (uint16_t)e0 460 | + detex_bptc_table_aWeight3[index] * (uint16_t)e1 + 32) >> 6); 461 | else // indexprecision == 4 462 | return (uint8_t)(((64 - detex_bptc_table_aWeight4[index]) * (uint16_t)e0 463 | + detex_bptc_table_aWeight4[index] * (uint16_t)e1 + 32) >> 6); 464 | } 465 | 466 | static const uint8_t bptc_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; 467 | 468 | static DETEX_INLINE_ONLY int GetColorIndexBitcount(int mode, int index_selection_bit) { 469 | // If the index selection bit is set for mode 4, return 3, otherwise 2. 470 | return bptc_color_index_bitcount[mode] + index_selection_bit; 471 | } 472 | 473 | static uint8_t bptc_alpha_index_bitcount[8] = { 3, 3, 2, 2, 3, 2, 4, 2 }; 474 | 475 | static DETEX_INLINE_ONLY int GetAlphaIndexBitcount(int mode, int index_selection_bit) { 476 | // If the index selection bit is set for mode 4, return 2, otherwise 3. 
477 | return bptc_alpha_index_bitcount[mode] - index_selection_bit; 478 | } 479 | 480 | static const uint8_t bptc_NS[8] = { 3, 2, 3, 2, 1, 1, 1, 2 }; 481 | 482 | static DETEX_INLINE_ONLY int GetNumberOfSubsets(int mode) { 483 | return bptc_NS[mode]; 484 | } 485 | 486 | static const uint8_t PB[8] = { 4, 6, 6, 6, 0, 0, 0, 6 }; 487 | 488 | static DETEX_INLINE_ONLY int GetNumberOfPartitionBits(int mode) { 489 | return PB[mode]; 490 | } 491 | 492 | static const uint8_t RB[8] = { 0, 0, 0, 0, 2, 2, 0, 0 }; 493 | 494 | static DETEX_INLINE_ONLY int GetNumberOfRotationBits(int mode) { 495 | return RB[mode]; 496 | } 497 | 498 | // Functions to extract parameters. */ 499 | 500 | static int ExtractMode(detexBlock128 *block) { 501 | for (int i = 0; i < 8; i++) 502 | if (block->data0 & ((uint64_t)1 << i)) { 503 | block->index = i + 1; 504 | return i; 505 | } 506 | // Illegal. 507 | return -1; 508 | } 509 | 510 | static DETEX_INLINE_ONLY int ExtractPartitionSetID(detexBlock128 *block, int mode) { 511 | return detexBlock128ExtractBits(block, GetNumberOfPartitionBits(mode)); 512 | } 513 | 514 | static DETEX_INLINE_ONLY int GetPartitionIndex(int nu_subsets, int partition_set_id, int i) { 515 | if (nu_subsets == 1) 516 | return 0; 517 | if (nu_subsets == 2) 518 | return detex_bptc_table_P2[partition_set_id * 16 + i]; 519 | return detex_bptc_table_P3[partition_set_id * 16 + i]; 520 | } 521 | 522 | static DETEX_INLINE_ONLY int ExtractRotationBits(detexBlock128 *block, int mode) { 523 | return detexBlock128ExtractBits(block, GetNumberOfRotationBits(mode)); 524 | } 525 | 526 | static DETEX_INLINE_ONLY int GetAnchorIndex(int partition_set_id, int partition, int nu_subsets) { 527 | if (partition == 0) 528 | return 0; 529 | if (nu_subsets == 2) 530 | return detex_bptc_table_anchor_index_second_subset[partition_set_id]; 531 | if (partition == 1) 532 | return detex_bptc_table_anchor_index_second_subset_of_three[partition_set_id]; 533 | return 
detex_bptc_table_anchor_index_third_subset[partition_set_id]; 534 | } 535 | 536 | static const uint8_t IB[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; 537 | static const uint8_t IB2[8] = { 0, 0, 0, 0, 3, 2, 0, 0 }; 538 | static const uint8_t mode_has_partition_bits[8] = { 1, 1, 1, 1, 0, 0, 0, 1 }; 539 | 540 | /* Decompress a 128-bit 4x4 pixel texture block compressed using BPTC mode 1. */ 541 | 542 | static bool DecompressBlockBPTCMode1(detexBlock128 * DETEX_RESTRICT block, 543 | uint8_t * DETEX_RESTRICT pixel_buffer) { 544 | uint64_t data0 = block->data0; 545 | uint64_t data1 = block->data1; 546 | int partition_set_id = detexGetBits64(data0, 2, 7); 547 | uint8_t endpoint[2 * 2 * 3]; // 2 subsets. 548 | endpoint[0] = detexGetBits64(data0, 8, 13); // red, subset 0, endpoint 0 549 | endpoint[3] = detexGetBits64(data0, 14, 19); // red, subset 0, endpoint 1 550 | endpoint[6] = detexGetBits64(data0, 20, 25); // red, subset 1, endpoint 0 551 | endpoint[9] = detexGetBits64(data0, 26, 31); // red, subset 1, endpoint 1 552 | endpoint[1] = detexGetBits64(data0, 32, 37); // green, subset 0, endpoint 0 553 | endpoint[4] = detexGetBits64(data0, 38, 43); // green, subset 0, endpoint 1 554 | endpoint[7] = detexGetBits64(data0, 44, 49); // green, subset 1, endpoint 0 555 | endpoint[10] = detexGetBits64(data0, 50, 55); // green, subset 1, endpoint 1 556 | endpoint[2] = detexGetBits64(data0, 56, 61); // blue, subset 0, endpoint 0 557 | endpoint[5] = detexGetBits64(data0, 62, 63) // blue, subset 0, endpoint 1 558 | | (detexGetBits64(data1, 0, 3) << 2); 559 | endpoint[8] = detexGetBits64(data1, 4, 9); // blue, subset 1, endpoint 0 560 | endpoint[11] = detexGetBits64(data1, 10, 15); // blue, subset 1, endpoint 1 561 | // Decode endpoints. 562 | for (int i = 0; i < 2 * 2; i++) { 563 | //component-wise left-shift 564 | endpoint[i * 3 + 0] <<= 2; 565 | endpoint[i * 3 + 1] <<= 2; 566 | endpoint[i * 3 + 2] <<= 2; 567 | } 568 | // P-bit is shared. 
569 | uint8_t pbit_zero = detexGetBits64(data1, 16, 16) << 1; 570 | uint8_t pbit_one = detexGetBits64(data1, 17, 17) << 1; 571 | // RGB only pbits for mode 1, one for each subset. 572 | for (int j = 0; j < 3; j++) { 573 | endpoint[0 * 3 + j] |= pbit_zero; 574 | endpoint[1 * 3 + j] |= pbit_zero; 575 | endpoint[2 * 3 + j] |= pbit_one; 576 | endpoint[3 * 3 + j] |= pbit_one; 577 | } 578 | for (int i = 0; i < 2 * 2; i++) { 579 | // Replicate each component's MSB into the LSB. 580 | endpoint[i * 3 + 0] |= endpoint[i * 3 + 0] >> 7; 581 | endpoint[i * 3 + 1] |= endpoint[i * 3 + 1] >> 7; 582 | endpoint[i * 3 + 2] |= endpoint[i * 3 + 2] >> 7; 583 | } 584 | 585 | uint8_t subset_index[16]; 586 | for (int i = 0; i < 16; i++) 587 | // subset_index[i] is a number from 0 to 1. 588 | subset_index[i] = detex_bptc_table_P2[partition_set_id * 16 + i]; 589 | uint8_t anchor_index[2]; 590 | anchor_index[0] = 0; 591 | anchor_index[1] = detex_bptc_table_anchor_index_second_subset[partition_set_id]; 592 | uint8_t color_index[16]; 593 | // Extract primary index bits. 594 | data1 >>= 18; 595 | for (int i = 0; i < 16; i++) 596 | if (i == anchor_index[subset_index[i]]) { 597 | // Highest bit is zero. 598 | color_index[i] = data1 & 3; // Get two bits. 599 | data1 >>= 2; 600 | } 601 | else { 602 | color_index[i] = data1 & 7; // Get three bits. 
603 | data1 >>= 3; 604 | } 605 | 606 | uint32_t *pixel32_buffer = (uint32_t *)pixel_buffer; 607 | for (int i = 0; i < 16; i++) { 608 | uint8_t endpoint_start[3]; 609 | uint8_t endpoint_end[3]; 610 | for (int j = 0; j < 3; j++) { 611 | endpoint_start[j] = endpoint[2 * subset_index[i] * 3 + j]; 612 | endpoint_end[j] = endpoint[(2 * subset_index[i] + 1) * 3 + j]; 613 | } 614 | uint32_t output; 615 | output = detexPack32R8(Interpolate(endpoint_start[0], endpoint_end[0], color_index[i], 3)); 616 | output |= detexPack32G8(Interpolate(endpoint_start[1], endpoint_end[1], color_index[i], 3)); 617 | output |= detexPack32B8(Interpolate(endpoint_start[2], endpoint_end[2], color_index[i], 3)); 618 | output |= detexPack32A8(0xFF); 619 | pixel32_buffer[i] = output; 620 | } 621 | return true; 622 | } 623 | 624 | /* Decompress a 128-bit 4x4 pixel texture block compressed using the BPTC */ 625 | /* (BC7) format. */ 626 | bool detexDecompressBlockBPTC(const uint8_t * DETEX_RESTRICT bitstring, uint32_t mode_mask, 627 | uint32_t flags, uint8_t * DETEX_RESTRICT pixel_buffer) { 628 | detexBlock128 block; 629 | block.data0 = *(uint64_t *)&bitstring[0]; 630 | block.data1 = *(uint64_t *)&bitstring[8]; 631 | block.index = 0; 632 | int mode = ExtractMode(&block); 633 | if (mode == -1) 634 | return 0; 635 | // Allow compression tied to specific modes (according to mode_mask). 
636 | if (!(mode_mask & ((int)1 << mode))) 637 | return 0; 638 | if (mode >= 4 && (flags & DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY)) 639 | return 0; 640 | if (mode < 4 && (flags & DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY)) 641 | return 0; 642 | if (mode == 1) 643 | return DecompressBlockBPTCMode1(&block, pixel_buffer); 644 | 645 | int nu_subsets = 1; 646 | int partition_set_id = 0; 647 | if (mode_has_partition_bits[mode]) { 648 | nu_subsets = GetNumberOfSubsets(mode); 649 | partition_set_id = ExtractPartitionSetID(&block, mode); 650 | } 651 | int rotation = ExtractRotationBits(&block, mode); 652 | int index_selection_bit = 0; 653 | if (mode == 4) 654 | index_selection_bit = detexBlock128ExtractBits(&block, 1); 655 | 656 | int alpha_index_bitcount = GetAlphaIndexBitcount(mode, index_selection_bit); 657 | int color_index_bitcount = GetColorIndexBitcount(mode, index_selection_bit); 658 | 659 | uint8_t endpoint_array[3 * 2 * 4]; // Max. 3 subsets. 660 | ExtractEndpoints(mode, nu_subsets, &block, endpoint_array); 661 | FullyDecodeEndpoints(endpoint_array, nu_subsets, mode, &block); 662 | 663 | uint8_t subset_index[16]; 664 | for (int i = 0; i < 16; i++) 665 | // subset_index[i] is a number from 0 to 2, or 0 to 1, or 0 depending on the number of subsets. 666 | subset_index[i] = GetPartitionIndex(nu_subsets, partition_set_id, i); 667 | uint8_t anchor_index[4] = { 0, 0, 0, 0 }; // Only need max. 3 elements. 668 | for (int i = 0; i < nu_subsets; i++) 669 | anchor_index[i] = GetAnchorIndex(partition_set_id, i, nu_subsets); 670 | uint8_t color_index[16]; 671 | uint8_t alpha_index[16]; 672 | memset(color_index, 0, sizeof(color_index)); 673 | memset(alpha_index, 0, sizeof(alpha_index)); 674 | // Extract primary index bits. 675 | uint64_t data1; 676 | if (block.index >= 64) { 677 | // Because the index bits are all in the second 64-bit word, there is no need to use 678 | // block_extract_bits(). 679 | // This implies the mode is not 4. 
680 | data1 = block.data1 >> (block.index - 64); 681 | uint8_t mask1 = (1 << IB[mode]) - 1; 682 | uint8_t mask2 = (1 << (IB[mode] - 1)) - 1; 683 | for (int i = 0; i < 16; i++) 684 | if (i == anchor_index[subset_index[i]]) { 685 | // Highest bit is zero. 686 | color_index[i] = data1 & mask2; 687 | data1 >>= IB[mode] - 1; 688 | alpha_index[i] = color_index[i]; 689 | } 690 | else { 691 | color_index[i] = data1 & mask1; 692 | data1 >>= IB[mode]; 693 | alpha_index[i] = color_index[i]; 694 | } 695 | } 696 | else { // Implies mode 4. 697 | // Because the bits cross the 64-bit word boundary, we have to be careful. 698 | // Block index is 50 at this point. 699 | uint64_t data = block.data0 >> 50; 700 | data |= block.data1 << 14; 701 | for (int i = 0; i < 16; i++) 702 | if (i == anchor_index[subset_index[i]]) { 703 | // Highest bit is zero. 704 | if (index_selection_bit) { // Implies mode == 4. 705 | alpha_index[i] = data & 0x1; 706 | data >>= 1; 707 | } 708 | else { 709 | color_index[i] = data & 0x1; 710 | data >>= 1; 711 | } 712 | } 713 | else { 714 | if (index_selection_bit) { // Implies mode == 4. 715 | alpha_index[i] = data & 0x3; 716 | data >>= 2; 717 | } 718 | else { 719 | color_index[i] = data & 0x3; 720 | data >>= 2; 721 | } 722 | } 723 | // Block index is 81 at this point. 724 | data1 = block.data1 >> (81 - 64); 725 | } 726 | // Extract secondary index bits. 727 | if (IB2[mode] > 0) { 728 | uint8_t mask1 = (1 << IB2[mode]) - 1; 729 | uint8_t mask2 = (1 << (IB2[mode] - 1)) - 1; 730 | for (int i = 0; i < 16; i++) 731 | if (i == anchor_index[subset_index[i]]) { 732 | // Highest bit is zero. 
733 | if (index_selection_bit) { 734 | color_index[i] = data1 & 0x3; 735 | data1 >>= 2; 736 | } 737 | else { 738 | // alpha_index[i] = block_extract_bits(&block, IB2[mode] - 1); 739 | alpha_index[i] = data1 & mask2; 740 | data1 >>= IB2[mode] - 1; 741 | } 742 | } 743 | else { 744 | if (index_selection_bit) { 745 | color_index[i] = data1 & 0x7; 746 | data1 >>= 3; 747 | } 748 | else { 749 | // alpha_index[i] = block_extract_bits(&block, IB2[mode]); 750 | alpha_index[i] = data1 & mask1; 751 | data1 >>= IB2[mode]; 752 | } 753 | } 754 | } 755 | 756 | uint32_t *pixel32_buffer = (uint32_t *)pixel_buffer; 757 | for (int i = 0; i < 16; i++) { 758 | uint8_t endpoint_start[4]; 759 | uint8_t endpoint_end[4]; 760 | for (int j = 0; j < 4; j++) { 761 | endpoint_start[j] = endpoint_array[2 * subset_index[i] * 4 + j]; 762 | endpoint_end[j] = endpoint_array[(2 * subset_index[i] + 1) * 4 + j]; 763 | } 764 | 765 | uint32_t output = 0; 766 | output = detexPack32R8(Interpolate(endpoint_start[0], endpoint_end[0], color_index[i], color_index_bitcount)); 767 | output |= detexPack32G8(Interpolate(endpoint_start[1], endpoint_end[1], color_index[i], color_index_bitcount)); 768 | output |= detexPack32B8(Interpolate(endpoint_start[2], endpoint_end[2], color_index[i], color_index_bitcount)); 769 | output |= detexPack32A8(Interpolate(endpoint_start[3], endpoint_end[3], alpha_index[i], alpha_index_bitcount)); 770 | 771 | if (rotation > 0) { 772 | if (rotation == 1) 773 | output = detexPack32RGBA8(detexPixel32GetA8(output), detexPixel32GetG8(output), 774 | detexPixel32GetB8(output), detexPixel32GetR8(output)); 775 | else 776 | if (rotation == 2) 777 | output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetA8(output), 778 | detexPixel32GetB8(output), detexPixel32GetG8(output)); 779 | else // rotation == 3 780 | output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetG8(output), 781 | detexPixel32GetA8(output), detexPixel32GetB8(output)); 782 | } 783 | pixel32_buffer[i] = 
output; 784 | } 785 | return true; 786 | } 787 | 788 | /* Return the internal mode of the BPTC block. */ 789 | uint32_t detexGetModeBPTC(const uint8_t *bitstring) { 790 | detexBlock128 block; 791 | block.data0 = *(uint64_t *)&bitstring[0]; 792 | block.data1 = *(uint64_t *)&bitstring[8]; 793 | block.index = 0; 794 | int mode = ExtractMode(&block); 795 | return mode; 796 | } 797 | 798 | void detexSetModeBPTC(uint8_t *bitstring, uint32_t mode, uint32_t flags, 799 | uint32_t *colors) { 800 | // Mode 0 starts with 1 801 | // Mode 1 starts with 01 802 | // ... 803 | // Mode 7 starts with 00000001 804 | int bit = 0x1 << mode; 805 | bitstring[0] &= ~(bit - 1); 806 | bitstring[0] |= bit; 807 | return; 808 | } 809 | 810 | -------------------------------------------------------------------------------- /bc7decomp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _MSC_VER 4 | #define DETEX_INLINE_ONLY __forceinline 5 | #define DETEX_RESTRICT __restrict 6 | #else 7 | #define DETEX_INLINE_ONLY 8 | #define DETEX_RESTRICT 9 | #endif 10 | 11 | enum { 12 | /* Function returns false (invalid block) when the compressed block */ 13 | /* is in a format not allowed to be generated by an encoder. */ 14 | DETEX_DECOMPRESS_FLAG_ENCODE = 0x1, 15 | /* For compression formats that have opaque and non-opaque modes, */ 16 | /* return false (invalid block) when the compressed block is encoded */ 17 | /* using a non-opaque mode. */ 18 | DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY = 0x2, 19 | /* For compression formats that have opaque and non-opaque modes, */ 20 | /* return false (invalid block) when the compressed block is encoded */ 21 | /* using an opaque mode. 
// Returns v limited to the inclusive range [l, h].
template <typename T> inline T clamp(T v, T l, T h)
{
	if (v < l)
		v = l;
	else if (v > h)
		v = h;
	return v;
}

// Absolute value of an int.
inline int iabs(int i)
{
	if (i < 0)
		i = -i;
	return i;
}

// Prints the command line help to stderr and returns EXIT_FAILURE so callers can "return print_usage();".
// The usage line lists the switches that main() actually parses (-a, -l, -u, -p, -g, -y, -k).
static int print_usage()
{
	fprintf(stderr, "bc7enc\n");
	fprintf(stderr, "Reads PNG files (with or without alpha channels) and packs them to BC7/BPTC using modes 1 and 6.\n");
	fprintf(stderr, "This tool works best with opaque images, or on images with relatively simple alpha channels.\n");
	fprintf(stderr, "By default, a DX10 DDS file and a unpacked PNG file will be written to the source file's directory with the .dds/_unpacked.png/_unpacked_alpha.png suffixes.\n\n");
	fprintf(stderr, "Usage: bc7enc [-apng_filename] [-l] [-uX] [-pX] [-g] [-y] [-k] input_filename.png [compressed_output.dds] [unpacked_output.png]\n");
	fprintf(stderr, "-apng_filename Load G channel of PNG file into alpha channel of source image\n");
	fprintf(stderr, "-l Use linear colorspace metrics instead of perceptual\n");
	fprintf(stderr, "-uX Higher quality levels, X ranges from [0,4], higher=slower\n");
	fprintf(stderr, "-pX Scan X partitions in mode 1, X ranges from [0,64], use 0 to disable mode 1 entirely (faster)\n");
	fprintf(stderr, "-g Don't write an unpacked output PNG file\n");
	fprintf(stderr, "-y Flip source image along Y axis before packing\n");
	fprintf(stderr, "-k Generate .ktx file instead of .dds file\n");

	return EXIT_FAILURE;
}

// One pixel: four 8-bit components stored in the order they are passed to set(r, g, b, a).
struct color_quad_u8
{
	uint8_t m_c[4];

	inline color_quad_u8(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
	{
		set(r, g, b, a);
	}

	// Grey pixel: y is copied to the first three components. Defaults to opaque black.
	inline color_quad_u8(uint8_t y = 0, uint8_t a = 255)
	{
		set(y, a);
	}

	inline color_quad_u8 &set(uint8_t y, uint8_t a = 255)
	{
		m_c[0] = y;
		m_c[1] = y;
		m_c[2] = y;
		m_c[3] = a;
		return *this;
	}

	inline color_quad_u8 &set(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
	{
		m_c[0] = r;
		m_c[1] = g;
		m_c[2] = b;
		m_c[3] = a;
		return *this;
	}

	// Component access; i must be 0..3 (checked with assert only).
	inline uint8_t &operator[] (uint32_t i) { assert(i < 4); return m_c[i]; }
	inline uint8_t operator[] (uint32_t i) const { assert(i < 4); return m_c[i]; }

	// 16.16 fixed point weighted sum of the first three components, rounded to nearest.
	inline int get_luma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; } // REC709 weightings
};
typedef std::vector<color_quad_u8> color_quad_u8_vec;
color_quad_u8 &operator()(uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + m_width * y]; } 99 | 100 | image_u8& clear() 101 | { 102 | m_width = m_height = 0; 103 | m_pixels.clear(); 104 | return *this; 105 | } 106 | 107 | image_u8& init(uint32_t width, uint32_t height) 108 | { 109 | clear(); 110 | 111 | m_width = width; 112 | m_height = height; 113 | m_pixels.resize(width * height); 114 | return *this; 115 | } 116 | 117 | image_u8& set_all(const color_quad_u8 &p) 118 | { 119 | for (uint32_t i = 0; i < m_pixels.size(); i++) 120 | m_pixels[i] = p; 121 | return *this; 122 | } 123 | 124 | image_u8& crop(uint32_t new_width, uint32_t new_height) 125 | { 126 | if ((m_width == new_width) && (m_height == new_height)) 127 | return *this; 128 | 129 | image_u8 new_image(new_width, new_height); 130 | 131 | const uint32_t w = std::min(m_width, new_width); 132 | const uint32_t h = std::min(m_height, new_height); 133 | 134 | for (uint32_t y = 0; y < h; y++) 135 | for (uint32_t x = 0; x < w; x++) 136 | new_image(x, y) = (*this)(x, y); 137 | 138 | return swap(new_image); 139 | } 140 | 141 | image_u8 &swap(image_u8 &other) 142 | { 143 | std::swap(m_width, other.m_width); 144 | std::swap(m_height, other.m_height); 145 | std::swap(m_pixels, other.m_pixels); 146 | return *this; 147 | } 148 | 149 | inline void get_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8 *pPixels) 150 | { 151 | assert((bx * width + width) <= m_width); 152 | assert((by * height + height) <= m_height); 153 | 154 | for (uint32_t y = 0; y < height; y++) 155 | memcpy(pPixels + y * width, &(*this)(bx * width, by * height + y), width * sizeof(color_quad_u8)); 156 | } 157 | 158 | inline void set_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const color_quad_u8 *pPixels) 159 | { 160 | assert((bx * width + width) <= m_width); 161 | assert((by * height + height) <= m_height); 162 | 163 | for (uint32_t y = 0; y < height; y++) 164 | 
memcpy(&(*this)(bx * width, by * height + y), pPixels + y * width, width * sizeof(color_quad_u8)); 165 | } 166 | 167 | image_u8 &swizzle(uint32_t r, uint32_t g, uint32_t b, uint32_t a) 168 | { 169 | assert((r | g | b | a) <= 3); 170 | for (uint32_t y = 0; y < m_height; y++) 171 | { 172 | for (uint32_t x = 0; x < m_width; x++) 173 | { 174 | color_quad_u8 tmp((*this)(x, y)); 175 | (*this)(x, y).set(tmp[r], tmp[g], tmp[b], tmp[a]); 176 | } 177 | } 178 | 179 | return *this; 180 | } 181 | 182 | private: 183 | color_quad_u8_vec m_pixels; 184 | uint32_t m_width, m_height; 185 | }; 186 | 187 | static bool load_png(const char *pFilename, image_u8 &img) 188 | { 189 | img.clear(); 190 | 191 | std::vector pixels; 192 | unsigned int w = 0, h = 0; 193 | unsigned int e = lodepng::decode(pixels, w, h, pFilename); 194 | if (e != 0) 195 | { 196 | fprintf(stderr, "Failed loading PNG file %s\n", pFilename); 197 | return false; 198 | } 199 | 200 | img.init(w, h); 201 | memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t)); 202 | 203 | return true; 204 | } 205 | 206 | static bool save_png(const char *pFilename, const image_u8 &img, bool save_alpha) 207 | { 208 | const uint32_t w = img.width(); 209 | const uint32_t h = img.height(); 210 | 211 | std::vector pixels; 212 | if (save_alpha) 213 | { 214 | pixels.resize(w * h * sizeof(color_quad_u8)); 215 | memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8)); 216 | } 217 | else 218 | { 219 | pixels.resize(w * h * 3); 220 | unsigned char *pDst = &pixels[0]; 221 | for (uint32_t y = 0; y < h; y++) 222 | for (uint32_t x = 0; x < w; x++, pDst += 3) 223 | pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2]; 224 | } 225 | 226 | return lodepng::encode(pFilename, pixels, w, h, save_alpha ? 
LCT_RGBA : LCT_RGB) == 0; 227 | } 228 | 229 | class image_metrics 230 | { 231 | public: 232 | double m_max, m_mean, m_mean_squared, m_root_mean_squared, m_peak_snr; 233 | 234 | image_metrics() 235 | { 236 | clear(); 237 | } 238 | 239 | void clear() 240 | { 241 | memset(this, 0, sizeof(*this)); 242 | } 243 | 244 | void compute(const image_u8 &a, const image_u8 &b, uint32_t first_channel, uint32_t num_channels) 245 | { 246 | const bool average_component_error = true; 247 | 248 | const uint32_t width = std::min(a.width(), b.width()); 249 | const uint32_t height = std::min(a.height(), b.height()); 250 | 251 | assert((first_channel < 4U) && (first_channel + num_channels <= 4U)); 252 | 253 | // Histogram approach originally due to Charles Bloom. 254 | double hist[256]; 255 | memset(hist, 0, sizeof(hist)); 256 | 257 | for (uint32_t y = 0; y < height; y++) 258 | { 259 | for (uint32_t x = 0; x < width; x++) 260 | { 261 | const color_quad_u8 &ca = a(x, y); 262 | const color_quad_u8 &cb = b(x, y); 263 | 264 | if (!num_channels) 265 | hist[iabs(ca.get_luma() - cb.get_luma())]++; 266 | else 267 | { 268 | for (uint32_t c = 0; c < num_channels; c++) 269 | hist[iabs(ca[first_channel + c] - cb[first_channel + c])]++; 270 | } 271 | } 272 | } 273 | 274 | m_max = 0; 275 | double sum = 0.0f, sum2 = 0.0f; 276 | for (uint32_t i = 0; i < 256; i++) 277 | { 278 | if (!hist[i]) 279 | continue; 280 | 281 | m_max = std::max(m_max, i); 282 | 283 | double x = i * hist[i]; 284 | 285 | sum += x; 286 | sum2 += i * x; 287 | } 288 | 289 | // See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html 290 | double total_values = width * height; 291 | 292 | if (average_component_error) 293 | total_values *= clamp(num_channels, 1, 4); 294 | 295 | m_mean = clamp(sum / total_values, 0.0f, 255.0f); 296 | m_mean_squared = clamp(sum2 / total_values, 0.0f, 255.0f * 255.0f); 297 | 298 | m_root_mean_squared = sqrt(m_mean_squared); 299 | 300 | if (!m_root_mean_squared) 301 | m_peak_snr = 
1e+10f; 302 | else 303 | m_peak_snr = clamp(log10(255.0f / m_root_mean_squared) * 20.0f, 0.0f, 500.0f); 304 | } 305 | }; 306 | 307 | struct bc7_block 308 | { 309 | uint64_t m_vals[2]; 310 | }; 311 | 312 | typedef std::vector bc7_block_vec; 313 | 314 | static bool save_bc7_dds(const char *pFilename, uint32_t width, uint32_t height, const bc7_block *pBlocks, bool srgb) 315 | { 316 | (void)srgb; 317 | 318 | FILE *pFile = NULL; 319 | pFile = fopen(pFilename, "wb"); 320 | if (!pFile) 321 | { 322 | fprintf(stderr, "Failed creating file %s!\n", pFilename); 323 | return false; 324 | } 325 | 326 | fwrite("DDS ", 4, 1, pFile); 327 | 328 | DDSURFACEDESC2 desc; 329 | memset(&desc, 0, sizeof(desc)); 330 | 331 | desc.dwSize = sizeof(desc); 332 | desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CAPS; 333 | 334 | desc.dwWidth = width; 335 | desc.dwHeight = height; 336 | 337 | desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE; 338 | desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat); 339 | 340 | desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC; 341 | 342 | desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0'); 343 | desc.ddpfPixelFormat.dwRGBBitCount = 0; 344 | 345 | const uint32_t pixel_format_bpp = 8; 346 | desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3; 347 | desc.dwFlags |= DDSD_LINEARSIZE; 348 | 349 | fwrite(&desc, sizeof(desc), 1, pFile); 350 | 351 | DDS_HEADER_DXT10 hdr10; 352 | memset(&hdr10, 0, sizeof(hdr10)); 353 | 354 | // Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here. 355 | // For best compatibility just write DXGI_FORMAT_BC7_UNORM. 356 | //hdr10.dxgiFormat = srgb ? 
DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM; 357 | hdr10.dxgiFormat = DXGI_FORMAT_BC7_UNORM; 358 | hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D; 359 | hdr10.arraySize = 1; 360 | 361 | fwrite(&hdr10, sizeof(hdr10), 1, pFile); 362 | 363 | fwrite(pBlocks, desc.lPitch, 1, pFile); 364 | 365 | if (fclose(pFile) == EOF) 366 | { 367 | fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename); 368 | return false; 369 | } 370 | 371 | return true; 372 | } 373 | 374 | static bool save_bc7_ktx(const char *pFilename, 375 | uint32_t width, uint32_t height, 376 | const bc7_block *pBlocks, bool srgb, bool has_alpha) 377 | { 378 | (void) has_alpha; // RGB without A is currently unsupported 379 | FILE *pFile = NULL; 380 | pFile = fopen(pFilename, "wb"); 381 | if (!pFile) 382 | { 383 | fprintf(stderr, "Failed creating file %s!\n", pFilename); 384 | return false; 385 | } 386 | 387 | uint32_t keyValueSizeBrutto = 0; 388 | 389 | // 390 | // key/value pair length computation 391 | // 392 | 393 | // first key/value pair 394 | uint32_t keyValueKtxOrientationSizeNetto = sizeof(ktxOrientation); 395 | uint32_t keyValueKtxOrientationSizeBrutto = (keyValueKtxOrientationSizeNetto + 3) & ~3; 396 | keyValueSizeBrutto += keyValueKtxOrientationSizeBrutto + 4 /* 4 is the size of the size field */; 397 | // additional pairs 398 | // ... 399 | 400 | // 401 | // header 402 | // 403 | 404 | struct KTX_HEADER header; 405 | memcpy(header.identifier, ktxFileIdentifier, sizeof(header.identifier)); 406 | header.endianness = ktxEndianess; 407 | header.glType = 0; // 0: compressed texture 408 | header.glTypeSize = 1; // 1: endianess independent, especially for compressed textures 409 | header.glFormat = 0; // 0: compressed texture 410 | header.glInternalFormat = (srgb) ? GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM : GL_COMPRESSED_RGBA_BPTC_UNORM; // see table 8.14 and chapter C.2 of OpenGL 4.4. specification 411 | // header.glBaseInternalFormat = (has_alpha) ? 
GL_RGBA : GL_RGB; // see table 8.11 412 | header.glBaseInternalFormat = GL_RGBA; // see table 8.11 413 | header.pixelWidth = width; 414 | header.pixelHeight = height; 415 | header.pixelDepth = 0; 416 | header.numberOfArrayElements = 0; // numberOfArrayElements denotes the number of array elements which is the number of elements in the array or the size of the array measured in elements. 417 | header.numberOfFaces = 1; // cubemap/s/arrays: 6, otherwise: 1 418 | header.numberOfMipmapLevels = 1; // non-mipmapped: 1 419 | header.bytesOfKeyValueData = keyValueSizeBrutto; 420 | fwrite(&header, sizeof(header), 1, pFile); 421 | 422 | // 423 | // key/value pairs 424 | // 425 | 426 | const uint8_t pad[3] = {0 , 0 , 0}; 427 | 428 | // first key/value pair 429 | fwrite(&keyValueKtxOrientationSizeNetto, sizeof(keyValueKtxOrientationSizeNetto), 1, pFile); 430 | fwrite(ktxOrientation, sizeof(ktxOrientation), 1, pFile); 431 | uint32_t numOfPad = keyValueKtxOrientationSizeBrutto - keyValueKtxOrientationSizeNetto; 432 | fwrite(pad, 1, numOfPad, pFile); 433 | // additional pairs 434 | // ... 
// Removes the filename extension (the last '.' and everything after it) from s, in place.
// Only the final path component is examined: the backwards scan stops at the first '/' or '\\',
// so a dot inside a directory name ("dir.v2/image") or a relative prefix ("../image") is
// left alone. A string without an extension is left unchanged.
static void strip_extension(std::string &s)
{
	for (size_t i = s.size(); i-- > 0; )
	{
		const char c = s[i];

		// Reached the directory part without finding a dot: there is no extension.
		if ((c == '/') || (c == '\\'))
			break;

		if (c == '.')
		{
			s.resize(i);
			break;
		}
	}
}
525 | } 526 | case 'p': 527 | { 528 | max_partitions_to_scan = atoi(pArg + 2); 529 | if ((max_partitions_to_scan < 0) || (max_partitions_to_scan > BC7ENC16_MAX_PARTITIONS1)) 530 | { 531 | fprintf(stderr, "Invalid argument: %s\n", pArg); 532 | return EXIT_FAILURE; 533 | } 534 | break; 535 | } 536 | case 'k': 537 | { 538 | ktx = true; 539 | break; 540 | } 541 | default: 542 | { 543 | fprintf(stderr, "Invalid argument: %s\n", pArg); 544 | return EXIT_FAILURE; 545 | } 546 | } 547 | } 548 | else 549 | { 550 | if (!src_filename.size()) 551 | src_filename = pArg; 552 | else if (!dds_output_filename.size()) 553 | dds_output_filename = pArg; 554 | else if (!png_output_filename.size()) 555 | png_output_filename = pArg; 556 | else 557 | { 558 | fprintf(stderr, "Invalid argument: %s\n", pArg); 559 | return EXIT_FAILURE; 560 | } 561 | } 562 | } 563 | 564 | if (!src_filename.size()) 565 | { 566 | fprintf(stderr, "No source filename specified!\n"); 567 | return EXIT_FAILURE; 568 | } 569 | 570 | if (!dds_output_filename.size()) 571 | { 572 | dds_output_filename = src_filename; 573 | strip_extension(dds_output_filename); 574 | dds_output_filename += (ktx) ? 
ktxFileNameExt : ".dds"; 575 | } 576 | 577 | if (!png_output_filename.size()) 578 | { 579 | png_output_filename = src_filename; 580 | strip_extension(png_output_filename); 581 | png_output_filename += "_unpacked.png"; 582 | } 583 | 584 | png_alpha_output_filename = png_output_filename; 585 | strip_extension(png_alpha_output_filename); 586 | png_alpha_output_filename += "_unpacked_alpha.png"; 587 | 588 | image_u8 source_image; 589 | if (!load_png(src_filename.c_str(), source_image)) 590 | return EXIT_FAILURE; 591 | 592 | printf("Source image: %s %ux%u\n", src_filename.c_str(), source_image.width(), source_image.height()); 593 | 594 | if (src_alpha_filename.size()) 595 | { 596 | image_u8 source_alpha_image; 597 | if (!load_png(src_alpha_filename.c_str(), source_alpha_image)) 598 | return EXIT_FAILURE; 599 | 600 | printf("Source alpha image: %s %ux%u\n", src_alpha_filename.c_str(), source_alpha_image.width(), source_alpha_image.height()); 601 | 602 | const uint32_t w = std::min(source_alpha_image.width(), source_image.width()); 603 | const uint32_t h = std::min(source_alpha_image.height(), source_image.height()); 604 | 605 | for (uint32_t y = 0; y < h; y++) 606 | for (uint32_t x = 0; x < w; x++) 607 | source_image(x, y)[3] = source_alpha_image(x, y)[1]; 608 | } 609 | 610 | const uint32_t orig_width = source_image.width(); 611 | const uint32_t orig_height = source_image.height(); 612 | 613 | if (y_flip) 614 | { 615 | image_u8 temp; 616 | temp.init(orig_width, orig_height); 617 | 618 | for (uint32_t y = 0; y < orig_height; y++) 619 | for (uint32_t x = 0; x < orig_width; x++) 620 | temp(x, (orig_height - 1) - y) = source_image(x, y); 621 | 622 | temp.swap(source_image); 623 | } 624 | 625 | source_image.crop((source_image.width() + 3) & ~3, (source_image.height() + 3) & ~3); 626 | 627 | const uint32_t blocks_x = source_image.width() / 4; 628 | const uint32_t blocks_y = source_image.height() / 4; 629 | 630 | bc7_block_vec packed_image(blocks_x * blocks_y); 631 | 632 | 
bc7enc16_compress_block_params pack_params; 633 | bc7enc16_compress_block_params_init(&pack_params); 634 | if (!perceptual) 635 | bc7enc16_compress_block_params_init_linear_weights(&pack_params); 636 | pack_params.m_max_partitions_mode1 = max_partitions_to_scan; 637 | pack_params.m_uber_level = uber_level; 638 | 639 | printf("Max mode 1 partitions: %u, uber level: %u, perceptual: %u\n", pack_params.m_max_partitions_mode1, pack_params.m_uber_level, perceptual); 640 | 641 | bc7enc16_compress_block_init(); 642 | 643 | bool has_alpha = false; 644 | 645 | clock_t start_t = clock(); 646 | for (uint32_t by = 0; by < blocks_y; by++) 647 | { 648 | for (uint32_t bx = 0; bx < blocks_x; bx++) 649 | { 650 | color_quad_u8 pixels[16]; 651 | 652 | source_image.get_block(bx, by, 4, 4, pixels); 653 | 654 | bc7_block *pBlock = &packed_image[bx + by * blocks_x]; 655 | 656 | if (bc7enc16_compress_block(pBlock, pixels, &pack_params)) 657 | has_alpha = true; 658 | } 659 | 660 | if ((by & 63) == 0) 661 | { 662 | printf("."); 663 | fflush(stdout); 664 | } 665 | } 666 | 667 | clock_t end_t = clock(); 668 | 669 | printf("\nTotal time: %f secs\n", (double)(end_t - start_t) / CLOCKS_PER_SEC); 670 | 671 | if (has_alpha) 672 | printf("Source image had an alpha channel.\n"); 673 | 674 | bool failed = false; 675 | if (ktx) 676 | { 677 | if (!save_bc7_ktx(dds_output_filename.c_str(), orig_width, orig_height, &packed_image[0], perceptual, has_alpha)) 678 | failed = true; 679 | else 680 | printf("Wrote KTX file %s\n", dds_output_filename.c_str()); 681 | } 682 | else 683 | { 684 | if (!save_bc7_dds(dds_output_filename.c_str(), orig_width, orig_height, &packed_image[0], perceptual)) 685 | failed = true; 686 | else 687 | printf("Wrote DDS file %s\n", dds_output_filename.c_str()); 688 | } 689 | 690 | if ((!no_output_png) && (png_output_filename.size())) 691 | { 692 | image_u8 unpacked_image(source_image.width(), source_image.height()); 693 | 694 | for (uint32_t by = 0; by < blocks_y; by++) 695 | { 696 | 
for (uint32_t bx = 0; bx < blocks_x; bx++) 697 | { 698 | bc7_block *pBlock = &packed_image[bx + by * blocks_x]; 699 | 700 | color_quad_u8 unpacked_pixels[16]; 701 | detexDecompressBlockBPTC((const uint8_t *)pBlock, UINT32_MAX, 0, (uint8_t *)unpacked_pixels); 702 | 703 | unpacked_image.set_block(bx, by, 4, 4, unpacked_pixels); 704 | } 705 | } 706 | 707 | image_metrics y_metrics; 708 | y_metrics.compute(source_image, unpacked_image, 0, 0); 709 | printf("Luma Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", y_metrics.m_max, y_metrics.m_root_mean_squared, y_metrics.m_peak_snr); 710 | 711 | image_metrics rgb_metrics; 712 | rgb_metrics.compute(source_image, unpacked_image, 0, 3); 713 | printf("RGB Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgb_metrics.m_max, rgb_metrics.m_root_mean_squared, rgb_metrics.m_peak_snr); 714 | 715 | image_metrics rgba_metrics; 716 | rgba_metrics.compute(source_image, unpacked_image, 0, 4); 717 | printf("RGBA Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgba_metrics.m_max, rgba_metrics.m_root_mean_squared, rgba_metrics.m_peak_snr); 718 | 719 | image_metrics a_metrics; 720 | a_metrics.compute(source_image, unpacked_image, 3, 1); 721 | printf("Alpha Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", a_metrics.m_max, a_metrics.m_root_mean_squared, a_metrics.m_peak_snr); 722 | 723 | if (!save_png(png_output_filename.c_str(), unpacked_image, false)) 724 | failed = true; 725 | else 726 | printf("Wrote PNG file %s\n", png_output_filename.c_str()); 727 | 728 | //if ((png_alpha_output_filename.size()) && (has_alpha)) 729 | if (png_alpha_output_filename.size()) 730 | { 731 | image_u8 unpacked_image_alpha(unpacked_image); 732 | for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++) 733 | for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++) 734 | unpacked_image_alpha(x, y).set(unpacked_image_alpha(x, y)[3], 255); 735 | 736 | if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false)) 737 | failed = true; 738 | else 739 | 
printf("Wrote PNG file %s\n", png_alpha_output_filename.c_str()); 740 | } 741 | } 742 | 743 | return failed ? EXIT_FAILURE : EXIT_SUCCESS; 744 | } 745 | -------------------------------------------------------------------------------- /bc7enc16.c: -------------------------------------------------------------------------------- 1 | // File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file) 2 | #include "bc7enc16.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Helpers 9 | static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } 10 | static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } 11 | static inline float saturate(float value) { return clampf(value, 0, 1.0f); } 12 | static inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; } 13 | static inline uint32_t minimumu(uint32_t a, uint32_t b) { return (a < b) ? a : b; } 14 | static inline float minimumf(float a, float b) { return (a < b) ? a : b; } 15 | static inline uint8_t maximumub(uint8_t a, uint8_t b) { return (a > b) ? a : b; } 16 | static inline uint32_t maximumu(uint32_t a, uint32_t b) { return (a > b) ? a : b; } 17 | static inline float maximumf(float a, float b) { return (a > b) ? 
a : b; } 18 | static inline int squarei(int i) { return i * i; } 19 | static inline float squaref(float i) { return i * i; } 20 | 21 | typedef struct { uint8_t m_c[4]; } color_quad_u8; 22 | typedef struct { float m_c[4]; } vec4F; 23 | 24 | static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { pRes->m_c[0] = (uint8_t)clampi(r, 0, 255); pRes->m_c[1] = (uint8_t)clampi(g, 0, 255); pRes->m_c[2] = (uint8_t)clampi(b, 0, 255); pRes->m_c[3] = (uint8_t)clampi(a, 0, 255); return pRes; } 25 | static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { assert((uint32_t)(r | g | b | a) <= 255); pRes->m_c[0] = (uint8_t)r; pRes->m_c[1] = (uint8_t)g; pRes->m_c[2] = (uint8_t)b; pRes->m_c[3] = (uint8_t)a; return pRes; } 26 | static inline bc7enc16_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS) { return (pLHS->m_c[0] != pRHS->m_c[0]) || (pLHS->m_c[1] != pRHS->m_c[1]) || (pLHS->m_c[2] != pRHS->m_c[2]) || (pLHS->m_c[3] != pRHS->m_c[3]); } 27 | static inline vec4F *vec4F_set_scalar(vec4F *pV, float x) { pV->m_c[0] = x; pV->m_c[1] = x; pV->m_c[2] = x; pV->m_c[3] = x; return pV; } 28 | static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w) { pV->m_c[0] = x; pV->m_c[1] = y; pV->m_c[2] = z; pV->m_c[3] = w; return pV; } 29 | static inline vec4F *vec4F_saturate_in_place(vec4F *pV) { pV->m_c[0] = saturate(pV->m_c[0]); pV->m_c[1] = saturate(pV->m_c[1]); pV->m_c[2] = saturate(pV->m_c[2]); pV->m_c[3] = saturate(pV->m_c[3]); return pV; } 30 | static inline vec4F vec4F_saturate(const vec4F *pV) { vec4F res; res.m_c[0] = saturate(pV->m_c[0]); res.m_c[1] = saturate(pV->m_c[1]); res.m_c[2] = saturate(pV->m_c[2]); res.m_c[3] = saturate(pV->m_c[3]); return res; } 31 | static inline vec4F vec4F_from_color(const color_quad_u8 *pC) { vec4F res; vec4F_set(&res, pC->m_c[0], pC->m_c[1], pC->m_c[2], pC->m_c[3]); return res; } 32 | 
static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] + pRHS->m_c[0], pLHS->m_c[1] + pRHS->m_c[1], pLHS->m_c[2] + pRHS->m_c[2], pLHS->m_c[3] + pRHS->m_c[3]); return res; } 33 | static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] - pRHS->m_c[0], pLHS->m_c[1] - pRHS->m_c[1], pLHS->m_c[2] - pRHS->m_c[2], pLHS->m_c[3] - pRHS->m_c[3]); return res; } 34 | static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS) { return pLHS->m_c[0] * pRHS->m_c[0] + pLHS->m_c[1] * pRHS->m_c[1] + pLHS->m_c[2] * pRHS->m_c[2] + pLHS->m_c[3] * pRHS->m_c[3]; } 35 | static inline vec4F vec4F_mul(const vec4F *pLHS, float s) { vec4F res; vec4F_set(&res, pLHS->m_c[0] * s, pLHS->m_c[1] * s, pLHS->m_c[2] * s, pLHS->m_c[3] * s); return res; } 36 | static inline vec4F *vec4F_normalize_in_place(vec4F *pV) { float s = pV->m_c[0] * pV->m_c[0] + pV->m_c[1] * pV->m_c[1] + pV->m_c[2] * pV->m_c[2] + pV->m_c[3] * pV->m_c[3]; if (s != 0.0f) { s = 1.0f / sqrtf(s); pV->m_c[0] *= s; pV->m_c[1] *= s; pV->m_c[2] *= s; pV->m_c[3] *= s; } return pV; } 37 | 38 | // Various BC7 tables 39 | static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; 40 | static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; 41 | // Precomputed weight constants used during least fit determination. 
For each entry in g_bc7_weights[]: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w 42 | static const float g_bc7_weights3x[8 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f, 43 | 0.079102f, 0.718750f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; 44 | static const float g_bc7_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f, 45 | 0.451416f, 0.328125f, 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f, 46 | 0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f }; 47 | static const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; 48 | static const uint8_t g_bc7_partition2[64 * 16] = 49 | { 50 | 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, 51 | 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, 52 | 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 
0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, 53 | 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, 54 | 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, 55 | 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, 56 | 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, 57 | 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 58 | }; 59 | static const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 }; 60 | static const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 }; 61 | static const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 
0, 0, 0, 6 }; 62 | static const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; 63 | static int get_bc7_color_index_size(int mode, int index_selection_bit) { return g_bc7_color_index_bitcount[mode] + index_selection_bit; } 64 | static const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; 65 | static const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 }; 66 | static const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; 67 | static const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; 68 | 69 | typedef struct { uint16_t m_error; uint8_t m_lo; uint8_t m_hi; } endpoint_err; 70 | 71 | static endpoint_err g_bc7_mode_1_optimal_endpoints[256][2]; // [c][pbit] 72 | static const uint32_t BC7ENC16_MODE_1_OPTIMAL_INDEX = 2; 73 | 74 | // Initialize the lookup table used for optimal single color compression in mode 1. Must be called before encoding. 75 | void bc7enc16_compress_block_init() 76 | { 77 | for (int c = 0; c < 256; c++) 78 | { 79 | for (uint32_t lp = 0; lp < 2; lp++) 80 | { 81 | endpoint_err best; 82 | best.m_error = (uint16_t)UINT16_MAX; 83 | for (uint32_t l = 0; l < 64; l++) 84 | { 85 | uint32_t low = ((l << 1) | lp) << 1; 86 | low |= (low >> 7); 87 | for (uint32_t h = 0; h < 64; h++) 88 | { 89 | uint32_t high = ((h << 1) | lp) << 1; 90 | high |= (high >> 7); 91 | const int k = (low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6; 92 | const int err = (k - c) * (k - c); 93 | if (err < best.m_error) 94 | { 95 | best.m_error = (uint16_t)err; 96 | best.m_lo = (uint8_t)l; 97 | best.m_hi = (uint8_t)h; 98 | } 99 | } // h 100 | } // l 101 | g_bc7_mode_1_optimal_endpoints[c][lp] = best; 102 | } // lp 103 | } // c 104 | } 105 | 106 | static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors) 107 
| { 108 | // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 109 | // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 110 | float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; 111 | float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; 112 | float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; 113 | float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; 114 | float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f; 115 | for (uint32_t i = 0; i < N; i++) 116 | { 117 | const uint32_t sel = pSelectors[i]; 118 | z00 += pSelector_weights[sel].m_c[0]; 119 | z10 += pSelector_weights[sel].m_c[1]; 120 | z11 += pSelector_weights[sel].m_c[2]; 121 | float w = pSelector_weights[sel].m_c[3]; 122 | q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; 123 | q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; 124 | q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; 125 | q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3]; 126 | } 127 | 128 | q10_r = t_r - q00_r; 129 | q10_g = t_g - q00_g; 130 | q10_b = t_b - q00_b; 131 | q10_a = t_a - q00_a; 132 | 133 | z01 = z10; 134 | 135 | float det = z00 * z11 - z01 * z10; 136 | if (det != 0.0f) 137 | det = 1.0f / det; 138 | 139 | float iz00, iz01, iz10, iz11; 140 | iz00 = z11 * det; 141 | iz01 = -z01 * det; 142 | iz10 = -z10 * det; 143 | iz11 = z00 * det; 144 | 145 | pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); 146 | pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); 147 | pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); 148 | pXl->m_c[3] = (float)(iz00 * q00_a + iz01 * q10_a); pXh->m_c[3] = (float)(iz10 * q00_a + iz11 * q10_a); 149 | } 150 | 151 | static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const 
color_quad_u8 *pColors) 152 | { 153 | float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; 154 | float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; 155 | float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; 156 | float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; 157 | for (uint32_t i = 0; i < N; i++) 158 | { 159 | const uint32_t sel = pSelectors[i]; 160 | z00 += pSelector_weights[sel].m_c[0]; 161 | z10 += pSelector_weights[sel].m_c[1]; 162 | z11 += pSelector_weights[sel].m_c[2]; 163 | float w = pSelector_weights[sel].m_c[3]; 164 | q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; 165 | q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; 166 | q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; 167 | } 168 | 169 | q10_r = t_r - q00_r; 170 | q10_g = t_g - q00_g; 171 | q10_b = t_b - q00_b; 172 | 173 | z01 = z10; 174 | 175 | float det = z00 * z11 - z01 * z10; 176 | if (det != 0.0f) 177 | det = 1.0f / det; 178 | 179 | float iz00, iz01, iz10, iz11; 180 | iz00 = z11 * det; 181 | iz01 = -z01 * det; 182 | iz10 = -z10 * det; 183 | iz11 = z00 * det; 184 | 185 | pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); 186 | pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); 187 | pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); 188 | pXl->m_c[3] = 255.0f; pXh->m_c[3] = 255.0f; 189 | } 190 | 191 | typedef struct 192 | { 193 | uint32_t m_num_pixels; 194 | const color_quad_u8 *m_pPixels; 195 | uint32_t m_num_selector_weights; 196 | const uint32_t *m_pSelector_weights; 197 | const vec4F *m_pSelector_weightsx; 198 | uint32_t m_comp_bits; 199 | uint32_t m_weights[4]; 200 | bc7enc16_bool m_has_alpha; 201 | bc7enc16_bool m_has_pbits; 202 | bc7enc16_bool m_endpoints_share_pbit; 203 | bc7enc16_bool m_perceptual; 204 | } color_cell_compressor_params; 205 | 206 | typedef struct 207 | { 208 | uint64_t m_best_overall_err; 209 | 
color_quad_u8 m_low_endpoint; 210 | color_quad_u8 m_high_endpoint; 211 | uint32_t m_pbits[2]; 212 | uint8_t *m_pSelectors; 213 | uint8_t *m_pSelectors_temp; 214 | } color_cell_compressor_results; 215 | 216 | static inline color_quad_u8 scale_color(const color_quad_u8 *pC, const color_cell_compressor_params *pParams) 217 | { 218 | color_quad_u8 results; 219 | 220 | const uint32_t n = pParams->m_comp_bits + (pParams->m_has_pbits ? 1 : 0); 221 | assert((n >= 4) && (n <= 8)); 222 | 223 | for (uint32_t i = 0; i < 4; i++) 224 | { 225 | uint32_t v = pC->m_c[i] << (8 - n); 226 | v |= (v >> n); 227 | assert(v <= 255); 228 | results.m_c[i] = (uint8_t)(v); 229 | } 230 | 231 | return results; 232 | } 233 | 234 | static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4]) 235 | { 236 | int dr, dg, db; 237 | 238 | if (perceptual) 239 | { 240 | const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; 241 | const int cr1 = ((int)pE1->m_c[0] << 9) - l1; 242 | const int cb1 = ((int)pE1->m_c[2] << 9) - l1; 243 | const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37; 244 | const int cr2 = ((int)pE2->m_c[0] << 9) - l2; 245 | const int cb2 = ((int)pE2->m_c[2] << 9) - l2; 246 | dr = (l1 - l2) >> 8; 247 | dg = (cr1 - cr2) >> 8; 248 | db = (cb1 - cb2) >> 8; 249 | } 250 | else 251 | { 252 | dr = (int)pE1->m_c[0] - (int)pE2->m_c[0]; 253 | dg = (int)pE1->m_c[1] - (int)pE2->m_c[1]; 254 | db = (int)pE1->m_c[2] - (int)pE2->m_c[2]; 255 | } 256 | 257 | return weights[0] * (uint32_t)(dr * dr) + weights[1] * (uint32_t)(dg * dg) + weights[2] * (uint32_t)(db * db); 258 | } 259 | 260 | static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4]) 261 | { 262 | int da = (int)pE1->m_c[3] - (int)pE2->m_c[3]; 263 | return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + 
(weights[3] * (uint32_t)(da * da)); 264 | } 265 | 266 | static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors) 267 | { 268 | uint32_t best_err = UINT_MAX; 269 | uint32_t best_p = 0; 270 | 271 | for (uint32_t p = 0; p < 2; p++) 272 | { 273 | uint32_t err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error; 274 | if (err < best_err) 275 | { 276 | best_err = err; 277 | best_p = p; 278 | } 279 | } 280 | 281 | const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p]; 282 | const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p]; 283 | const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p]; 284 | 285 | color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0); 286 | color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0); 287 | pResults->m_pbits[0] = best_p; 288 | pResults->m_pbits[1] = 0; 289 | 290 | memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels); 291 | 292 | color_quad_u8 p; 293 | for (uint32_t i = 0; i < 3; i++) 294 | { 295 | uint32_t low = ((pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; 296 | low |= (low >> 7); 297 | 298 | uint32_t high = ((pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; 299 | high |= (high >> 7); 300 | 301 | p.m_c[i] = (uint8_t)((low * (64 - g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX] + 32) >> 6); 302 | } 303 | p.m_c[3] = 255; 304 | 305 | uint64_t total_err = 0; 306 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 307 | total_err += compute_color_distance_rgb(&p, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); 308 | 309 | pResults->m_best_overall_err = total_err; 310 | 311 | return total_err; 312 | } 
// Scores one candidate endpoint pair against all pixels in pParams.
//   pLow/pHigh - quantized endpoints, without p-bits.
//   pbits      - p-bits for the low/high endpoint (only pbits[0] is used when the endpoints share one).
// Picks a selector for every pixel, writing them to pResults->m_pSelectors_temp. If the summed
// error beats pResults->m_best_overall_err, the endpoints, p-bits and selectors are committed to
// *pResults. Returns the candidate's total error whether or not it was committed.
static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
{
	color_quad_u8 quantMinColor = *pLow;
	color_quad_u8 quantMaxColor = *pHigh;

	// Append the p-bit as the new least significant bit of every component.
	if (pParams->m_has_pbits)
	{
		uint32_t minPBit, maxPBit;

		if (pParams->m_endpoints_share_pbit)
			maxPBit = minPBit = pbits[0];
		else
		{
			minPBit = pbits[0];
			maxPBit = pbits[1];
		}

		quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit);
		quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit);
		quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit);
		quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit);

		quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit);
		quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit);
		quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit);
		quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit);
	}

	// Expand both endpoints to full 8-bit components.
	color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams);
	color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams);

	const uint32_t N = pParams->m_num_selector_weights;

	// Palette of the N interpolated colors; entries 0 and N-1 are the endpoints themselves.
	color_quad_u8 weightedColors[16];
	weightedColors[0] = actualMinColor;
	weightedColors[N - 1] = actualMaxColor;

	// Without alpha only R, G, B of the interior entries are written; the RGB-only distance
	// function used on that path never reads their alpha.
	const uint32_t nc = pParams->m_has_alpha ? 4 : 3;
	for (uint32_t i = 1; i < (N - 1); i++)
		for (uint32_t j = 0; j < nc; j++)
			weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6);

	// Low endpoint and the low->high direction, used to project pixels onto the endpoint line.
	const int lr = actualMinColor.m_c[0];
	const int lg = actualMinColor.m_c[1];
	const int lb = actualMinColor.m_c[2];
	const int dr = actualMaxColor.m_c[0] - lr;
	const int dg = actualMaxColor.m_c[1] - lg;
	const int db = actualMaxColor.m_c[2] - lb;

	uint64_t total_err = 0;

	if (!pParams->m_perceptual)
	{
		// Linear metric: estimate the selector by projection, then test only that entry and the
		// one below it instead of scanning all N.
		if (pParams->m_has_alpha)
		{
			const int la = actualMinColor.m_c[3];
			const int da = actualMaxColor.m_c[3] - la;

			// N / |direction|^2; the tiny constant keeps the divisor non-zero when the endpoints coincide.
			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);

			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
			{
				const color_quad_u8 *pC = &pParams->m_pPixels[i];
				int r = pC->m_c[0];
				int g = pC->m_c[1];
				int b = pC->m_c[2];
				int a = pC->m_c[3];

				int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
				// Clamp to [1, N-1] so that best_sel - 1 is always a valid palette index.
				best_sel = clampi(best_sel, 1, N - 1);

				uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
				uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams->m_weights);

				// Keep the higher selector on ties.
				if (err1 > err0)
				{
					err1 = err0;
					--best_sel;
				}
				total_err += err1;

				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
			}
		}
		else
		{
			// Same scheme as above, restricted to R, G, B.
			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);

			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
			{
				const color_quad_u8 *pC = &pParams->m_pPixels[i];
				int r = pC->m_c[0];
				int g = pC->m_c[1];
				int b = pC->m_c[2];

				int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
				sel = clampi(sel, 1, N - 1);

				uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
				uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams->m_weights);

				int best_sel = sel;
				uint64_t best_err = err1;
				if (err0 < best_err)
				{
					best_err = err0;
					best_sel = sel - 1;
				}

				total_err += best_err;

				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
			}
		}
	}
	else
	{
		// Perceptual metric: exhaustively test every palette entry for every pixel; the strict
		// comparison keeps the lowest selector on ties.
		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
		{
			uint64_t best_err = UINT64_MAX;
			uint32_t best_sel = 0;

			if (pParams->m_has_alpha)
			{
				for (uint32_t j = 0; j < N; j++)
				{
					uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
					if (err < best_err)
					{
						best_err = err;
						best_sel = j;
					}
				}
			}
			else
			{
				for (uint32_t j = 0; j < N; j++)
				{
					uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
					if (err < best_err)
					{
						best_err = err;
						best_sel = j;
					}
				}
			}

			total_err += best_err;

			pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
		}
	}

	// Commit only when this candidate strictly improves on the best solution so far.
	if (total_err < pResults->m_best_overall_err)
	{
		pResults->m_best_overall_err = total_err;

		pResults->m_low_endpoint = *pLow;
		pResults->m_high_endpoint = *pHigh;

		pResults->m_pbits[0] = pbits[0];
		pResults->m_pbits[1] = pbits[1];

		memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
	}

	return total_err;
}
total_err; 481 | } 482 | 483 | static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const vec4F *pXl, const vec4F *pXh, uint32_t iscale) 484 | { 485 | if (mode == 1) 486 | { 487 | // fix degenerate case where the input collapses to a single colorspace voxel, and we loose all freedom (test with grayscale ramps) 488 | for (uint32_t i = 0; i < 3; i++) 489 | { 490 | if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i]) 491 | { 492 | if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.0f) 493 | { 494 | if (pTrialMinColor->m_c[i] > (iscale >> 1)) 495 | { 496 | if (pTrialMinColor->m_c[i] > 0) 497 | pTrialMinColor->m_c[i]--; 498 | else 499 | if (pTrialMaxColor->m_c[i] < iscale) 500 | pTrialMaxColor->m_c[i]++; 501 | } 502 | else 503 | { 504 | if (pTrialMaxColor->m_c[i] < iscale) 505 | pTrialMaxColor->m_c[i]++; 506 | else if (pTrialMinColor->m_c[i] > 0) 507 | pTrialMinColor->m_c[i]--; 508 | } 509 | } 510 | } 511 | } 512 | } 513 | } 514 | 515 | static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) 516 | { 517 | vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh); 518 | 519 | if (pParams->m_has_pbits) 520 | { 521 | const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1; 522 | const float scalep = (float)iscalep; 523 | 524 | const int32_t totalComps = pParams->m_has_alpha ? 4 : 3; 525 | 526 | uint32_t best_pbits[2]; 527 | color_quad_u8 bestMinColor, bestMaxColor; 528 | 529 | if (!pParams->m_endpoints_share_pbit) 530 | { 531 | float best_err0 = 1e+9; 532 | float best_err1 = 1e+9; 533 | 534 | for (int p = 0; p < 2; p++) 535 | { 536 | color_quad_u8 xMinColor, xMaxColor; 537 | 538 | // Notes: The pbit controls which quantization intervals are selected. 539 | // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc. 
// Quantizes the floating-point endpoints xl/xh (clamped to [0,1] on entry) to the precision given
// by pParams, choosing p-bits when the mode has them, and evaluates the resulting candidate via
// evaluate_solution. The evaluation is skipped when a solution exists and the candidate is
// identical to the one already stored in pResults.
// Returns pResults->m_best_overall_err after the (possible) evaluation.
static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
{
	vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh);

	if (pParams->m_has_pbits)
	{
		// Largest value representable with the component bits plus the p-bit.
		const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1;
		const float scalep = (float)iscalep;

		const int32_t totalComps = pParams->m_has_alpha ? 4 : 3;

		uint32_t best_pbits[2];
		color_quad_u8 bestMinColor, bestMaxColor;

		if (!pParams->m_endpoints_share_pbit)
		{
			// Independent p-bits: pick the better p-bit for each endpoint separately.
			float best_err0 = 1e+9;
			float best_err1 = 1e+9;

			for (int p = 0; p < 2; p++)
			{
				color_quad_u8 xMinColor, xMaxColor;

				// Notes: The pbit controls which quantization intervals are selected.
				// total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
				// pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
				// rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
				// rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
				for (uint32_t c = 0; c < 4; c++)
				{
					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
				}

				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);

				// Squared quantization error per endpoint, measured on the 0..255 scale.
				float err0 = 0, err1 = 0;
				for (int i = 0; i < totalComps; i++)
				{
					err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
					err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
				}

				if (err0 < best_err0)
				{
					best_err0 = err0;
					best_pbits[0] = p;

					// Drop the p-bit again; endpoints are stored without it.
					bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1;
					bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1;
					bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1;
					bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1;
				}

				if (err1 < best_err1)
				{
					best_err1 = err1;
					best_pbits[1] = p;

					bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1;
					bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1;
					bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1;
					bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1;
				}
			}
		}
		else
		{
			// Endpoints share pbits
			float best_err = 1e+9;

			for (int p = 0; p < 2; p++)
			{
				color_quad_u8 xMinColor, xMaxColor;
				for (uint32_t c = 0; c < 4; c++)
				{
					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
				}

				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);

				// Combined error of both endpoints, measured on the 0..1 scale (unlike the branch above).
				float err = 0;
				for (int i = 0; i < totalComps; i++)
					err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);

				if (err < best_err)
				{
					best_err = err;
					best_pbits[0] = p;
					best_pbits[1] = p;
					for (uint32_t j = 0; j < 4; j++)
					{
						bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1;
						bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1;
					}
				}
			}
		}

		// The endpoints no longer include the p-bit, so the largest component value is iscalep >> 1.
		fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1);

		// Evaluate unless this exact candidate is already the stored best solution.
		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1]))
			evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults);
	}
	else
	{
		// No p-bits: round each component to the nearest of the 2^comp_bits levels.
		const int iscale = (1 << pParams->m_comp_bits) - 1;
		const float scale = (float)iscale;

		color_quad_u8 trialMinColor, trialMaxColor;
		color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f));
		color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f));

		fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale);

		// pResults->m_pbits is passed through unchanged; evaluate_solution ignores p-bits when m_has_pbits is false.
		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
			evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
	}

	return pResults->m_best_overall_err;
}
683 | vec4F_set_scalar(&axis, 0.0f); 684 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 685 | { 686 | vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); 687 | color = vec4F_sub(&color, &meanColorScaled); 688 | vec4F a = vec4F_mul(&color, color.m_c[0]); 689 | vec4F b = vec4F_mul(&color, color.m_c[1]); 690 | vec4F c = vec4F_mul(&color, color.m_c[2]); 691 | vec4F d = vec4F_mul(&color, color.m_c[3]); 692 | vec4F n = i ? axis : color; 693 | vec4F_normalize_in_place(&n); 694 | axis.m_c[0] += vec4F_dot(&a, &n); 695 | axis.m_c[1] += vec4F_dot(&b, &n); 696 | axis.m_c[2] += vec4F_dot(&c, &n); 697 | axis.m_c[3] += vec4F_dot(&d, &n); 698 | } 699 | vec4F_normalize_in_place(&axis); 700 | } 701 | else 702 | { 703 | // Use covar technique for RGB PCA, because it doesn't require per-pixel normalization. 704 | float cov[6] = { 0, 0, 0, 0, 0, 0 }; 705 | 706 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 707 | { 708 | const color_quad_u8 *pV = &pParams->m_pPixels[i]; 709 | float r = pV->m_c[0] - meanColorScaled.m_c[0]; 710 | float g = pV->m_c[1] - meanColorScaled.m_c[1]; 711 | float b = pV->m_c[2] - meanColorScaled.m_c[2]; 712 | cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b; 713 | } 714 | 715 | float vfr = .9f, vfg = 1.0f, vfb = .7f; 716 | for (uint32_t iter = 0; iter < 3; iter++) 717 | { 718 | float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2]; 719 | float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4]; 720 | float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5]; 721 | 722 | float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); 723 | if (m > 1e-10f) 724 | { 725 | m = 1.0f / m; 726 | r *= m; g *= m; b *= m; 727 | } 728 | 729 | vfr = r; vfg = g; vfb = b; 730 | } 731 | 732 | float len = vfr*vfr + vfg*vfg + vfb*vfb; 733 | if (len < 1e-10f) 734 | vec4F_set_scalar(&axis, 0.0f); 735 | else 736 | { 737 | len = 1.0f / sqrtf(len); 738 | vfr *= len; vfg *= len; vfb *= len; 739 | vec4F_set(&axis, vfr, vfg, vfb, 0); 740 | } 741 | } 742 | 743 | 
if (vec4F_dot(&axis, &axis) < .5f) 744 | { 745 | if (pParams->m_perceptual) 746 | vec4F_set(&axis, .213f, .715f, .072f, pParams->m_has_alpha ? .715f : 0); 747 | else 748 | vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams->m_has_alpha ? 1.0f : 0); 749 | vec4F_normalize_in_place(&axis); 750 | } 751 | 752 | float l = 1e+9f, h = -1e+9f; 753 | 754 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 755 | { 756 | vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); 757 | 758 | vec4F q = vec4F_sub(&color, &meanColorScaled); 759 | float d = vec4F_dot(&q, &axis); 760 | 761 | l = minimumf(l, d); 762 | h = maximumf(h, d); 763 | } 764 | 765 | l *= (1.0f / 255.0f); 766 | h *= (1.0f / 255.0f); 767 | 768 | vec4F b0 = vec4F_mul(&axis, l); 769 | vec4F b1 = vec4F_mul(&axis, h); 770 | vec4F c0 = vec4F_add(&meanColor, &b0); 771 | vec4F c1 = vec4F_add(&meanColor, &b1); 772 | vec4F minColor = vec4F_saturate(&c0); 773 | vec4F maxColor = vec4F_saturate(&c1); 774 | 775 | vec4F whiteVec; 776 | vec4F_set_scalar(&whiteVec, 1.0f); 777 | if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec)) 778 | { 779 | vec4F temp = minColor; 780 | minColor = maxColor; 781 | maxColor = temp; 782 | } 783 | // First find a solution using the block's PCA. 784 | if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults)) 785 | return 0; 786 | 787 | if (pComp_params->m_try_least_squares) 788 | { 789 | // Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors. 
790 | vec4F xl, xh; 791 | vec4F_set_scalar(&xl, 0.0f); 792 | vec4F_set_scalar(&xh, 0.0f); 793 | if (pParams->m_has_alpha) 794 | compute_least_squares_endpoints_rgba(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 795 | else 796 | compute_least_squares_endpoints_rgb(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 797 | 798 | xl = vec4F_mul(&xl, (1.0f / 255.0f)); 799 | xh = vec4F_mul(&xh, (1.0f / 255.0f)); 800 | 801 | if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 802 | return 0; 803 | } 804 | 805 | if (pComp_params->m_uber_level > 0) 806 | { 807 | // In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors, 808 | // then try decrementing the selectrors, then try both. 809 | uint8_t selectors_temp[16], selectors_temp1[16]; 810 | memcpy(selectors_temp, pResults->m_pSelectors, pParams->m_num_pixels); 811 | 812 | const int max_selector = pParams->m_num_selector_weights - 1; 813 | 814 | uint32_t min_sel = 16; 815 | uint32_t max_sel = 0; 816 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 817 | { 818 | uint32_t sel = selectors_temp[i]; 819 | min_sel = minimumu(min_sel, sel); 820 | max_sel = maximumu(max_sel, sel); 821 | } 822 | 823 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 824 | { 825 | uint32_t sel = selectors_temp[i]; 826 | if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1))) 827 | sel++; 828 | selectors_temp1[i] = (uint8_t)sel; 829 | } 830 | 831 | vec4F xl, xh; 832 | vec4F_set_scalar(&xl, 0.0f); 833 | vec4F_set_scalar(&xh, 0.0f); 834 | if (pParams->m_has_alpha) 835 | compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 836 | else 837 | compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, 
&xl, &xh, pParams->m_pPixels); 838 | 839 | xl = vec4F_mul(&xl, (1.0f / 255.0f)); 840 | xh = vec4F_mul(&xh, (1.0f / 255.0f)); 841 | 842 | if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 843 | return 0; 844 | 845 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 846 | { 847 | uint32_t sel = selectors_temp[i]; 848 | if ((sel == max_sel) && (sel > 0)) 849 | sel--; 850 | selectors_temp1[i] = (uint8_t)sel; 851 | } 852 | 853 | if (pParams->m_has_alpha) 854 | compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 855 | else 856 | compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 857 | 858 | xl = vec4F_mul(&xl, (1.0f / 255.0f)); 859 | xh = vec4F_mul(&xh, (1.0f / 255.0f)); 860 | 861 | if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 862 | return 0; 863 | 864 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 865 | { 866 | uint32_t sel = selectors_temp[i]; 867 | if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1))) 868 | sel++; 869 | else if ((sel == max_sel) && (sel > 0)) 870 | sel--; 871 | selectors_temp1[i] = (uint8_t)sel; 872 | } 873 | 874 | if (pParams->m_has_alpha) 875 | compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 876 | else 877 | compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 878 | 879 | xl = vec4F_mul(&xl, (1.0f / 255.0f)); 880 | xh = vec4F_mul(&xh, (1.0f / 255.0f)); 881 | 882 | if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 883 | return 0; 884 | 885 | // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another. 
886 | const uint32_t uber_err_thresh = (pParams->m_num_pixels * 56) >> 4; 887 | if ((pComp_params->m_uber_level >= 2) && (pResults->m_best_overall_err > uber_err_thresh)) 888 | { 889 | const int Q = (pComp_params->m_uber_level >= 4) ? (pComp_params->m_uber_level - 2) : 1; 890 | for (int ly = -Q; ly <= 1; ly++) 891 | { 892 | for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++) 893 | { 894 | if ((ly == 0) && (hy == max_selector)) 895 | continue; 896 | 897 | for (uint32_t i = 0; i < pParams->m_num_pixels; i++) 898 | selectors_temp1[i] = (uint8_t)clampf(floorf((float)max_selector * ((float)selectors_temp[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_selector); 899 | 900 | //vec4F xl, xh; 901 | vec4F_set_scalar(&xl, 0.0f); 902 | vec4F_set_scalar(&xh, 0.0f); 903 | if (pParams->m_has_alpha) 904 | compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 905 | else 906 | compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels); 907 | 908 | xl = vec4F_mul(&xl, (1.0f / 255.0f)); 909 | xh = vec4F_mul(&xh, (1.0f / 255.0f)); 910 | 911 | if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) 912 | return 0; 913 | } 914 | } 915 | } 916 | } 917 | 918 | if (mode == 1) 919 | { 920 | // Try encoding the partition as a single color by using the optimal singe colors tables to encode the block to its mean. 
921 | color_cell_compressor_results avg_results = *pResults; 922 | const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); 923 | uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); 924 | if (avg_err < pResults->m_best_overall_err) 925 | { 926 | *pResults = avg_results; 927 | memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels); 928 | pResults->m_best_overall_err = avg_err; 929 | } 930 | } 931 | 932 | return pResults->m_best_overall_err; 933 | } 934 | 935 | static uint64_t color_cell_compression_est(uint32_t num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far) 936 | { 937 | // Find RGB bounds as an approximation of the block's principle axis 938 | uint32_t lr = 255, lg = 255, lb = 255; 939 | uint32_t hr = 0, hg = 0, hb = 0; 940 | for (uint32_t i = 0; i < num_pixels; i++) 941 | { 942 | const color_quad_u8 *pC = &pPixels[i]; 943 | if (pC->m_c[0] < lr) lr = pC->m_c[0]; 944 | if (pC->m_c[1] < lg) lg = pC->m_c[1]; 945 | if (pC->m_c[2] < lb) lb = pC->m_c[2]; 946 | if (pC->m_c[0] > hr) hr = pC->m_c[0]; 947 | if (pC->m_c[1] > hg) hg = pC->m_c[1]; 948 | if (pC->m_c[2] > hb) hb = pC->m_c[2]; 949 | } 950 | 951 | color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0); 952 | color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0); 953 | 954 | // Place endpoints at bbox diagonals and compute interpolated colors 955 | const uint32_t N = 8; 956 | color_quad_u8 weightedColors[8]; 957 | 958 | weightedColors[0] = lowColor; 959 | weightedColors[N - 1] = highColor; 960 | for (uint32_t i = 1; i < (N - 1); i++) 961 | { 962 | weightedColors[i].m_c[0] = (uint8_t)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6); 963 | weightedColors[i].m_c[1] = 
(uint8_t)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * g_bc7_weights3[i] + 32) >> 6); 964 | weightedColors[i].m_c[2] = (uint8_t)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6); 965 | } 966 | 967 | // Compute dots and thresholds 968 | const int ar = highColor.m_c[0] - lowColor.m_c[0]; 969 | const int ag = highColor.m_c[1] - lowColor.m_c[1]; 970 | const int ab = highColor.m_c[2] - lowColor.m_c[2]; 971 | 972 | int dots[8]; 973 | for (uint32_t i = 0; i < N; i++) 974 | dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab; 975 | 976 | int thresh[8 - 1]; 977 | for (uint32_t i = 0; i < (N - 1); i++) 978 | thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1; 979 | 980 | uint64_t total_err = 0; 981 | if (perceptual) 982 | { 983 | // Transform block's interpolated colors to YCbCr 984 | int l1[8], cr1[8], cb1[8]; 985 | for (int j = 0; j < 8; j++) 986 | { 987 | const color_quad_u8 *pE1 = &weightedColors[j]; 988 | l1[j] = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; 989 | cr1[j] = ((int)pE1->m_c[0] << 9) - l1[j]; 990 | cb1[j] = ((int)pE1->m_c[2] << 9) - l1[j]; 991 | } 992 | 993 | for (uint32_t i = 0; i < num_pixels; i++) 994 | { 995 | const color_quad_u8 *pC = &pPixels[i]; 996 | 997 | int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; 998 | 999 | // Find approximate selector 1000 | uint32_t s = 0; 1001 | if (d >= thresh[6]) 1002 | s = 7; 1003 | else if (d >= thresh[5]) 1004 | s = 6; 1005 | else if (d >= thresh[4]) 1006 | s = 5; 1007 | else if (d >= thresh[3]) 1008 | s = 4; 1009 | else if (d >= thresh[2]) 1010 | s = 3; 1011 | else if (d >= thresh[1]) 1012 | s = 2; 1013 | else if (d >= thresh[0]) 1014 | s = 1; 1015 | 1016 | // Compute error 1017 | const int l2 = pC->m_c[0] * 109 + pC->m_c[1] * 366 + pC->m_c[2] * 37; 1018 | const int cr2 = ((int)pC->m_c[0] << 9) - l2; 1019 | const int cb2 = ((int)pC->m_c[2] << 9) - l2; 1020 | 1021 | const int dl = 
(l1[s] - l2) >> 8; 1022 | const int dcr = (cr1[s] - cr2) >> 8; 1023 | const int dcb = (cb1[s] - cb2) >> 8; 1024 | 1025 | int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb); 1026 | 1027 | total_err += ie; 1028 | if (total_err > best_err_so_far) 1029 | break; 1030 | } 1031 | } 1032 | else 1033 | { 1034 | for (uint32_t i = 0; i < num_pixels; i++) 1035 | { 1036 | const color_quad_u8 *pC = &pPixels[i]; 1037 | 1038 | int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; 1039 | 1040 | // Find approximate selector 1041 | uint32_t s = 0; 1042 | if (d >= thresh[6]) 1043 | s = 7; 1044 | else if (d >= thresh[5]) 1045 | s = 6; 1046 | else if (d >= thresh[4]) 1047 | s = 5; 1048 | else if (d >= thresh[3]) 1049 | s = 4; 1050 | else if (d >= thresh[2]) 1051 | s = 3; 1052 | else if (d >= thresh[1]) 1053 | s = 2; 1054 | else if (d >= thresh[0]) 1055 | s = 1; 1056 | 1057 | // Compute error 1058 | const color_quad_u8 *pE1 = &weightedColors[s]; 1059 | 1060 | int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; 1061 | int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; 1062 | int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; 1063 | 1064 | total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db); 1065 | if (total_err > best_err_so_far) 1066 | break; 1067 | } 1068 | } 1069 | 1070 | return total_err; 1071 | } 1072 | 1073 | // This table contains bitmasks indicating which "key" partitions must be best ranked before this partition is worth evaluating. 1074 | // We first rank the best/most used 14 partitions (sorted by usefulness), record the best one found as the key partition, then use 1075 | // that to control the other partitions to evaluate. The quality loss is ~.08 dB RGB PSNR, the perf gain is up to ~11% (at uber level 0). 
1076 | static const uint32_t g_partition_predictors[35] = 1077 | { 1078 | UINT32_MAX, 1079 | UINT32_MAX, 1080 | UINT32_MAX, 1081 | UINT32_MAX, 1082 | UINT32_MAX, 1083 | (1 << 1) | (1 << 2) | (1 << 8), 1084 | (1 << 1) | (1 << 3) | (1 << 7), 1085 | UINT32_MAX, 1086 | UINT32_MAX, 1087 | (1 << 2) | (1 << 8) | (1 << 16), 1088 | (1 << 7) | (1 << 3) | (1 << 15), 1089 | UINT32_MAX, 1090 | (1 << 8) | (1 << 14) | (1 << 16), 1091 | (1 << 7) | (1 << 14) | (1 << 15), 1092 | UINT32_MAX, 1093 | UINT32_MAX, 1094 | UINT32_MAX, 1095 | UINT32_MAX, 1096 | (1 << 14) | (1 << 15), 1097 | (1 << 16) | (1 << 22) | (1 << 14), 1098 | (1 << 17) | (1 << 24) | (1 << 14), 1099 | (1 << 2) | (1 << 14) | (1 << 15) | (1 << 1), 1100 | UINT32_MAX, 1101 | (1 << 1) | (1 << 3) | (1 << 14) | (1 << 16) | (1 << 22), 1102 | UINT32_MAX, 1103 | (1 << 1) | (1 << 2) | (1 << 15) | (1 << 17) | (1 << 24), 1104 | (1 << 1) | (1 << 3) | (1 << 22), 1105 | UINT32_MAX, 1106 | UINT32_MAX, 1107 | UINT32_MAX, 1108 | (1 << 14) | (1 << 15) | (1 << 16) | (1 << 17), 1109 | UINT32_MAX, 1110 | UINT32_MAX, 1111 | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 27) | (1 << 4) | (1 << 24), 1112 | (1 << 14) | (1 << 15) | (1 << 16) | (1 << 11) | (1 << 17) | (1 << 27) 1113 | }; 1114 | 1115 | // Estimate the partition used by mode 1. This scans through each partition and computes an approximate error for each. 1116 | static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, uint32_t pweights[4]) 1117 | { 1118 | const uint32_t total_partitions = minimumu(pComp_params->m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1); 1119 | if (total_partitions <= 1) 1120 | return 0; 1121 | 1122 | uint64_t best_err = UINT64_MAX; 1123 | uint32_t best_partition = 0; 1124 | 1125 | // Partition order sorted by usage frequency across a large test corpus. Pattern 34 (checkerboard) must appear in slot 34. 
1126 | // Using a sorted order allows the user to decrease the # of partitions to scan with minimal loss in quality. 1127 | static const uint8_t s_sorted_partition_order[64] = 1128 | { 1129 | 1 - 1, 14 - 1, 2 - 1, 3 - 1, 16 - 1, 15 - 1, 11 - 1, 17 - 1, 1130 | 4 - 1, 24 - 1, 27 - 1, 7 - 1, 8 - 1, 22 - 1, 20 - 1, 30 - 1, 1131 | 9 - 1, 5 - 1, 10 - 1, 21 - 1, 6 - 1, 32 - 1, 23 - 1, 18 - 1, 1132 | 19 - 1, 12 - 1, 13 - 1, 31 - 1, 25 - 1, 26 - 1, 29 - 1, 28 - 1, 1133 | 33 - 1, 34 - 1, 35 - 1, 46 - 1, 47 - 1, 52 - 1, 50 - 1, 51 - 1, 1134 | 49 - 1, 39 - 1, 40 - 1, 38 - 1, 54 - 1, 53 - 1, 55 - 1, 37 - 1, 1135 | 58 - 1, 59 - 1, 56 - 1, 42 - 1, 41 - 1, 43 - 1, 44 - 1, 60 - 1, 1136 | 45 - 1, 57 - 1, 48 - 1, 36 - 1, 61 - 1, 64 - 1, 63 - 1, 62 - 1 1137 | }; 1138 | 1139 | assert(s_sorted_partition_order[34] == 34); 1140 | 1141 | int best_key_partition = 0; 1142 | 1143 | for (uint32_t partition_iter = 0; (partition_iter < total_partitions) && (best_err > 0); partition_iter++) 1144 | { 1145 | const uint32_t partition = s_sorted_partition_order[partition_iter]; 1146 | 1147 | // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14. 
1148 | if (pComp_params->m_mode1_partition_estimation_filterbank) 1149 | { 1150 | if ((partition_iter >= 14) && (partition_iter <= 34)) 1151 | { 1152 | const uint32_t best_key_partition_bitmask = 1 << (best_key_partition + 1); 1153 | if ((g_partition_predictors[partition] & best_key_partition_bitmask) == 0) 1154 | { 1155 | if (partition_iter == 34) 1156 | break; 1157 | 1158 | continue; 1159 | } 1160 | } 1161 | } 1162 | 1163 | const uint8_t *pPartition = &g_bc7_partition2[partition * 16]; 1164 | 1165 | color_quad_u8 subset_colors[2][16]; 1166 | uint32_t subset_total_colors[2] = { 0, 0 }; 1167 | for (uint32_t index = 0; index < 16; index++) 1168 | subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = pPixels[index]; 1169 | 1170 | uint64_t total_subset_err = 0; 1171 | for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++) 1172 | total_subset_err += color_cell_compression_est(subset_total_colors[subset], &subset_colors[subset][0], pComp_params->m_perceptual, pweights, best_err); 1173 | 1174 | if (total_subset_err < best_err) 1175 | { 1176 | best_err = total_subset_err; 1177 | best_partition = partition; 1178 | } 1179 | 1180 | // If the checkerboard pattern doesn't get the highest ranking vs. the previous (lower frequency) patterns, then just stop now because statistically the subsequent patterns won't do well either. 
1181 | if ((partition == 34) && (best_partition != 34)) 1182 | break; 1183 | 1184 | if (partition_iter == 13) 1185 | best_key_partition = best_partition; 1186 | 1187 | } // partition 1188 | 1189 | return best_partition; 1190 | } 1191 | 1192 | static void set_block_bits(uint8_t *pBytes, uint32_t val, uint32_t num_bits, uint32_t *pCur_ofs) 1193 | { 1194 | assert((num_bits <= 32) && (val < (1ULL << num_bits))); 1195 | while (num_bits) 1196 | { 1197 | const uint32_t n = minimumu(8 - (*pCur_ofs & 7), num_bits); 1198 | pBytes[*pCur_ofs >> 3] |= (uint8_t)(val << (*pCur_ofs & 7)); 1199 | val >>= n; 1200 | num_bits -= n; 1201 | *pCur_ofs += n; 1202 | } 1203 | assert(*pCur_ofs <= 128); 1204 | } 1205 | 1206 | typedef struct 1207 | { 1208 | uint32_t m_mode; 1209 | uint32_t m_partition; 1210 | uint8_t m_selectors[16]; 1211 | color_quad_u8 m_low[2]; 1212 | color_quad_u8 m_high[2]; 1213 | uint32_t m_pbits[2][2]; 1214 | } bc7_optimization_results; 1215 | 1216 | static void encode_bc7_block(void *pBlock, const bc7_optimization_results *pResults) 1217 | { 1218 | const uint32_t best_mode = pResults->m_mode; 1219 | const uint32_t total_subsets = g_bc7_num_subsets[best_mode]; 1220 | const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode]; 1221 | const uint8_t *pPartition = (total_subsets == 2) ? &g_bc7_partition2[pResults->m_partition * 16] : &g_bc7_partition1[0]; 1222 | 1223 | uint8_t color_selectors[16]; 1224 | memcpy(color_selectors, pResults->m_selectors, 16); 1225 | 1226 | color_quad_u8 low[2], high[2]; 1227 | memcpy(low, pResults->m_low, sizeof(low)); 1228 | memcpy(high, pResults->m_high, sizeof(high)); 1229 | 1230 | uint32_t pbits[2][2]; 1231 | memcpy(pbits, pResults->m_pbits, sizeof(pbits)); 1232 | 1233 | int anchor[2] = { -1, -1 }; 1234 | 1235 | for (uint32_t k = 0; k < total_subsets; k++) 1236 | { 1237 | const uint32_t anchor_index = k ? 
g_bc7_table_anchor_index_second_subset[pResults->m_partition] : 0; 1238 | anchor[k] = anchor_index; 1239 | 1240 | const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, 0); 1241 | const uint32_t num_color_indices = 1 << color_index_bits; 1242 | 1243 | if (color_selectors[anchor_index] & (num_color_indices >> 1)) 1244 | { 1245 | for (uint32_t i = 0; i < 16; i++) 1246 | if (pPartition[i] == k) 1247 | color_selectors[i] = (uint8_t)((num_color_indices - 1) - color_selectors[i]); 1248 | 1249 | color_quad_u8 tmp = low[k]; 1250 | low[k] = high[k]; 1251 | high[k] = tmp; 1252 | 1253 | if (!g_bc7_mode_has_shared_p_bits[best_mode]) 1254 | { 1255 | uint32_t t = pbits[k][0]; 1256 | pbits[k][0] = pbits[k][1]; 1257 | pbits[k][1] = t; 1258 | } 1259 | } 1260 | } 1261 | 1262 | uint8_t *pBlock_bytes = (uint8_t *)(pBlock); 1263 | memset(pBlock_bytes, 0, BC7ENC16_BLOCK_SIZE); 1264 | 1265 | uint32_t cur_bit_ofs = 0; 1266 | set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); 1267 | 1268 | if (total_partitions > 1) 1269 | set_block_bits(pBlock_bytes, pResults->m_partition, 6, &cur_bit_ofs); 1270 | 1271 | const uint32_t total_comps = (best_mode >= 4) ? 4 : 3; 1272 | for (uint32_t comp = 0; comp < total_comps; comp++) 1273 | { 1274 | for (uint32_t subset = 0; subset < total_subsets; subset++) 1275 | { 1276 | set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); 1277 | set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? 
g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); 1278 | } 1279 | } 1280 | 1281 | for (uint32_t subset = 0; subset < total_subsets; subset++) 1282 | { 1283 | set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs); 1284 | if (!g_bc7_mode_has_shared_p_bits[best_mode]) 1285 | set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs); 1286 | } 1287 | 1288 | for (int idx = 0; idx < 16; idx++) 1289 | { 1290 | uint32_t n = get_bc7_color_index_size(best_mode, 0); 1291 | if ((idx == anchor[0]) || (idx == anchor[1])) 1292 | n--; 1293 | set_block_bits(pBlock_bytes, color_selectors[idx], n, &cur_bit_ofs); 1294 | } 1295 | 1296 | assert(cur_bit_ofs == 128); 1297 | } 1298 | 1299 | static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams) 1300 | { 1301 | color_cell_compressor_results results6; 1302 | 1303 | pParams->m_pSelector_weights = g_bc7_weights4; 1304 | pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x; 1305 | pParams->m_num_selector_weights = 16; 1306 | pParams->m_comp_bits = 7; 1307 | pParams->m_has_pbits = BC7ENC16_TRUE; 1308 | pParams->m_has_alpha = BC7ENC16_TRUE; 1309 | pParams->m_perceptual = pComp_params->m_perceptual; 1310 | pParams->m_num_pixels = 16; 1311 | pParams->m_pPixels = pPixels; 1312 | 1313 | bc7_optimization_results opt_results; 1314 | results6.m_pSelectors = opt_results.m_selectors; 1315 | 1316 | uint8_t selectors_temp[16]; 1317 | results6.m_pSelectors_temp = selectors_temp; 1318 | 1319 | color_cell_compression(6, pParams, &results6, pComp_params); 1320 | 1321 | opt_results.m_mode = 6; 1322 | opt_results.m_partition = 0; 1323 | opt_results.m_low[0] = results6.m_low_endpoint; 1324 | opt_results.m_high[0] = results6.m_high_endpoint; 1325 | opt_results.m_pbits[0][0] = results6.m_pbits[0]; 1326 | opt_results.m_pbits[0][1] = results6.m_pbits[1]; 1327 | 1328 | 
encode_bc7_block(pBlock, &opt_results); 1329 | } 1330 | 1331 | static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams) 1332 | { 1333 | uint8_t selectors_temp[16]; 1334 | 1335 | // Mode 6 1336 | bc7_optimization_results opt_results; 1337 | 1338 | pParams->m_pSelector_weights = g_bc7_weights4; 1339 | pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x; 1340 | pParams->m_num_selector_weights = 16; 1341 | pParams->m_comp_bits = 7; 1342 | pParams->m_has_pbits = BC7ENC16_TRUE; 1343 | pParams->m_endpoints_share_pbit = BC7ENC16_FALSE; 1344 | pParams->m_perceptual = pComp_params->m_perceptual; 1345 | pParams->m_num_pixels = 16; 1346 | pParams->m_pPixels = pPixels; 1347 | pParams->m_has_alpha = BC7ENC16_FALSE; 1348 | 1349 | color_cell_compressor_results results6; 1350 | results6.m_pSelectors = opt_results.m_selectors; 1351 | results6.m_pSelectors_temp = selectors_temp; 1352 | 1353 | uint64_t best_err = color_cell_compression(6, pParams, &results6, pComp_params); 1354 | 1355 | opt_results.m_mode = 6; 1356 | opt_results.m_partition = 0; 1357 | opt_results.m_low[0] = results6.m_low_endpoint; 1358 | opt_results.m_high[0] = results6.m_high_endpoint; 1359 | opt_results.m_pbits[0][0] = results6.m_pbits[0]; 1360 | opt_results.m_pbits[0][1] = results6.m_pbits[1]; 1361 | 1362 | // Mode 1 1363 | if ((best_err > 0) && (pComp_params->m_max_partitions_mode1 > 0)) 1364 | { 1365 | const uint32_t trial_partition = estimate_partition(pPixels, pComp_params, pParams->m_weights); 1366 | pParams->m_pSelector_weights = g_bc7_weights3; 1367 | pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights3x; 1368 | pParams->m_num_selector_weights = 8; 1369 | pParams->m_comp_bits = 6; 1370 | pParams->m_has_pbits = BC7ENC16_TRUE; 1371 | pParams->m_endpoints_share_pbit = BC7ENC16_TRUE; 1372 | 1373 | const uint8_t *pPartition = &g_bc7_partition2[trial_partition * 16]; 1374 | 1375 | 
color_quad_u8 subset_colors[2][16]; 1376 | 1377 | uint32_t subset_total_colors1[2] = { 0, 0 }; 1378 | 1379 | uint8_t subset_pixel_index1[2][16]; 1380 | uint8_t subset_selectors1[2][16]; 1381 | color_cell_compressor_results subset_results1[2]; 1382 | 1383 | for (uint32_t idx = 0; idx < 16; idx++) 1384 | { 1385 | const uint32_t p = pPartition[idx]; 1386 | subset_colors[p][subset_total_colors1[p]] = pPixels[idx]; 1387 | subset_pixel_index1[p][subset_total_colors1[p]] = (uint8_t)idx; 1388 | subset_total_colors1[p]++; 1389 | } 1390 | 1391 | uint64_t trial_err = 0; 1392 | for (uint32_t subset = 0; subset < 2; subset++) 1393 | { 1394 | pParams->m_num_pixels = subset_total_colors1[subset]; 1395 | pParams->m_pPixels = &subset_colors[subset][0]; 1396 | 1397 | color_cell_compressor_results *pResults = &subset_results1[subset]; 1398 | pResults->m_pSelectors = &subset_selectors1[subset][0]; 1399 | pResults->m_pSelectors_temp = selectors_temp; 1400 | uint64_t err = color_cell_compression(1, pParams, pResults, pComp_params); 1401 | trial_err += err; 1402 | if (trial_err > best_err) 1403 | break; 1404 | 1405 | } // subset 1406 | 1407 | if (trial_err < best_err) 1408 | { 1409 | best_err = trial_err; 1410 | opt_results.m_mode = 1; 1411 | opt_results.m_partition = trial_partition; 1412 | for (uint32_t subset = 0; subset < 2; subset++) 1413 | { 1414 | for (uint32_t i = 0; i < subset_total_colors1[subset]; i++) 1415 | opt_results.m_selectors[subset_pixel_index1[subset][i]] = subset_selectors1[subset][i]; 1416 | opt_results.m_low[subset] = subset_results1[subset].m_low_endpoint; 1417 | opt_results.m_high[subset] = subset_results1[subset].m_high_endpoint; 1418 | opt_results.m_pbits[subset][0] = subset_results1[subset].m_pbits[0]; 1419 | } 1420 | } 1421 | } 1422 | 1423 | encode_bc7_block(pBlock, &opt_results); 1424 | } 1425 | 1426 | bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params) 1427 | { 1428 | 
assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0); 1429 | 1430 | const color_quad_u8 *pPixels = (const color_quad_u8 *)(pPixelsRGBA); 1431 | 1432 | color_cell_compressor_params params; 1433 | if (pComp_params->m_perceptual) 1434 | { 1435 | // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion 1436 | const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f)); 1437 | const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f)); 1438 | params.m_weights[0] = (int)(pComp_params->m_weights[0] * 4.0f); 1439 | params.m_weights[1] = (int)(pComp_params->m_weights[1] * 4.0f * pr_weight); 1440 | params.m_weights[2] = (int)(pComp_params->m_weights[2] * 4.0f * pb_weight); 1441 | params.m_weights[3] = pComp_params->m_weights[3] * 4; 1442 | } 1443 | else 1444 | memcpy(params.m_weights, pComp_params->m_weights, sizeof(params.m_weights)); 1445 | 1446 | for (uint32_t i = 0; i < 16; i++) 1447 | { 1448 | if (pPixels[i].m_c[3] < 255) 1449 | { 1450 | handle_alpha_block(pBlock, pPixels, pComp_params, ¶ms); 1451 | return BC7ENC16_TRUE; 1452 | } 1453 | } 1454 | handle_opaque_block(pBlock, pPixels, pComp_params, ¶ms); 1455 | return BC7ENC16_FALSE; 1456 | } 1457 | 1458 | /* 1459 | ------------------------------------------------------------------------------ 1460 | This software is available under 2 licenses -- choose whichever you prefer. 1461 | ------------------------------------------------------------------------------ 1462 | ALTERNATIVE A - MIT License 1463 | Copyright(c) 2018 Richard Geldreich, Jr. 
1464 | Permission is hereby granted, free of charge, to any person obtaining a copy of 1465 | this software and associated documentation files(the "Software"), to deal in 1466 | the Software without restriction, including without limitation the rights to 1467 | use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies 1468 | of the Software, and to permit persons to whom the Software is furnished to do 1469 | so, subject to the following conditions : 1470 | The above copyright notice and this permission notice shall be included in all 1471 | copies or substantial portions of the Software. 1472 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1473 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1474 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 1475 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1476 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1477 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1478 | SOFTWARE. 1479 | ------------------------------------------------------------------------------ 1480 | ALTERNATIVE B - Public Domain(www.unlicense.org) 1481 | This is free and unencumbered software released into the public domain. 1482 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 1483 | software, either in source code form or as a compiled binary, for any purpose, 1484 | commercial or non - commercial, and by any means. 
1485 | In jurisdictions that recognize copyright laws, the author or authors of this 1486 | software dedicate any and all copyright interest in the software to the public 1487 | domain.We make this dedication for the benefit of the public at large and to 1488 | the detriment of our heirs and successors.We intend this dedication to be an 1489 | overt act of relinquishment in perpetuity of all present and future rights to 1490 | this software under copyright law. 1491 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1492 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1493 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE 1494 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 1495 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 1496 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 1497 | ------------------------------------------------------------------------------ 1498 | */ 1499 | -------------------------------------------------------------------------------- /bc7enc16.h: -------------------------------------------------------------------------------- 1 | // File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c) 2 | #include 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define BC7ENC16_BLOCK_SIZE (16) 10 | #define BC7ENC16_MAX_PARTITIONS1 (64) 11 | #define BC7ENC16_MAX_UBER_LEVEL (4) 12 | 13 | typedef uint8_t bc7enc16_bool; 14 | #define BC7ENC16_TRUE (1) 15 | #define BC7ENC16_FALSE (0) 16 | 17 | typedef struct 18 | { 19 | // m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC16_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality. 20 | uint32_t m_max_partitions_mode1; 21 | 22 | // Relative RGBA or YCbCrA weights. 
23 | uint32_t m_weights[4]; 24 | 25 | // m_uber_level may range from 0 to BC7ENC16_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality. 26 | uint32_t m_uber_level; 27 | 28 | // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB. 29 | bc7enc16_bool m_perceptual; 30 | 31 | // Set m_try_least_squares to false for slightly faster/lower quality compression. 32 | bc7enc16_bool m_try_least_squares; 33 | 34 | // When m_mode1_partition_estimation_filterbank, the mode1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful. 35 | // There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to a 11% gain in speed depending on the other settings. 36 | bc7enc16_bool m_mode1_partition_estimation_filterbank; 37 | 38 | } bc7enc16_compress_block_params; 39 | 40 | inline void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p) 41 | { 42 | p->m_perceptual = BC7ENC16_FALSE; 43 | p->m_weights[0] = 1; 44 | p->m_weights[1] = 1; 45 | p->m_weights[2] = 1; 46 | p->m_weights[3] = 1; 47 | } 48 | 49 | inline void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p) 50 | { 51 | p->m_perceptual = BC7ENC16_TRUE; 52 | p->m_weights[0] = 128; 53 | p->m_weights[1] = 64; 54 | p->m_weights[2] = 16; 55 | p->m_weights[3] = 32; 56 | } 57 | 58 | inline void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p) 59 | { 60 | p->m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1; 61 | p->m_try_least_squares = BC7ENC16_TRUE; 62 | p->m_mode1_partition_estimation_filterbank = BC7ENC16_TRUE; 63 | p->m_uber_level = 0; 64 | bc7enc16_compress_block_params_init_perceptual_weights(p); 65 | } 66 | 67 | // bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts). 
68 | void bc7enc16_compress_block_init(); 69 | 70 | // Packs a single block of 16x16 RGBA pixels (R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6. 71 | // Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6. 72 | // Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it return BC7ENC16_FALSE. (This is not an error code - a block is always encoded.) 73 | bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params); 74 | 75 | #ifdef __cplusplus 76 | } 77 | #endif 78 | -------------------------------------------------------------------------------- /build_msvc.cmd: -------------------------------------------------------------------------------- 1 | cmake -G "Visual Studio 14 2015 Win64" . 2 | -------------------------------------------------------------------------------- /dds_defs.h: -------------------------------------------------------------------------------- 1 | // File: dds_defs.h 2 | // DX9 .DDS file header definitions. 
#pragma once

// The definitions below use the fixed-width integer types, so pull them in here rather than
// relying on the including file to have done it first.
#include <stdint.h>

// Packs four byte values into a little-endian FOURCC code ('a' ends up in the lowest byte).
// Each operand is widened to uint32_t before shifting so that a byte value >= 0x80 in the
// top position cannot overflow a signed int.
#define PIXEL_FMT_FOURCC(a, b, c, d) ((uint32_t)(a) | ((uint32_t)(b) << 8U) | ((uint32_t)(c) << 16U) | ((uint32_t)(d) << 24U))

// Pixel formats, identified by FOURCC code.
enum pixel_format
{
    PIXEL_FMT_INVALID = 0,

    PIXEL_FMT_DXT1 = PIXEL_FMT_FOURCC('D', 'X', 'T', '1'),
    PIXEL_FMT_DXT2 = PIXEL_FMT_FOURCC('D', 'X', 'T', '2'),
    PIXEL_FMT_DXT3 = PIXEL_FMT_FOURCC('D', 'X', 'T', '3'),
    PIXEL_FMT_DXT4 = PIXEL_FMT_FOURCC('D', 'X', 'T', '4'),
    PIXEL_FMT_DXT5 = PIXEL_FMT_FOURCC('D', 'X', 'T', '5'),
    PIXEL_FMT_3DC = PIXEL_FMT_FOURCC('A', 'T', 'I', '2'), // DXN_YX
    PIXEL_FMT_DXN = PIXEL_FMT_FOURCC('A', '2', 'X', 'Y'), // DXN_XY
    PIXEL_FMT_DXT5A = PIXEL_FMT_FOURCC('A', 'T', 'I', '1'), // ATI1N, http://developer.amd.com/media/gpu_assets/Radeon_X1x00_Programming_Guide.pdf

    // Non-standard formats (some of these are supported by ATI's Compressonator)
    PIXEL_FMT_DXT5_CCxY = PIXEL_FMT_FOURCC('C', 'C', 'x', 'Y'),
    PIXEL_FMT_DXT5_xGxR = PIXEL_FMT_FOURCC('x', 'G', 'x', 'R'),
    PIXEL_FMT_DXT5_xGBR = PIXEL_FMT_FOURCC('x', 'G', 'B', 'R'),
    PIXEL_FMT_DXT5_AGBR = PIXEL_FMT_FOURCC('A', 'G', 'B', 'R'),

    PIXEL_FMT_DXT1A = PIXEL_FMT_FOURCC('D', 'X', '1', 'A'),
    PIXEL_FMT_ETC1 = PIXEL_FMT_FOURCC('E', 'T', 'C', '1'),

    PIXEL_FMT_R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'x'),
    PIXEL_FMT_L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'x'),
    PIXEL_FMT_A8 = PIXEL_FMT_FOURCC('x', 'x', 'x', 'A'),
    PIXEL_FMT_A8L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'A'),
    PIXEL_FMT_A8R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'A')
};

const uint32_t cDDSMaxImageDimensions = 8192U;

// Total size of header is sizeof(uint32)+cDDSSizeofDDSurfaceDesc2;
const uint32_t cDDSSizeofDDSurfaceDesc2 = 124;

// "DDS "
const uint32_t cDDSFileSignature = 0x20534444;

struct DDCOLORKEY
{
    uint32_t dwUnused0;
    uint32_t dwUnused1;
};

struct DDPIXELFORMAT
{
    uint32_t dwSize;
    uint32_t dwFlags;
    uint32_t dwFourCC;
    uint32_t dwRGBBitCount; // ATI compressonator will place a FOURCC code here for swizzled/cooked DXTn formats
    uint32_t dwRBitMask;
    uint32_t dwGBitMask;
    uint32_t dwBBitMask;
    uint32_t dwRGBAlphaBitMask;
};

struct DDSCAPS2
{
    uint32_t dwCaps;
    uint32_t dwCaps2;
    uint32_t dwCaps3;
    uint32_t dwCaps4;
};

// Surface description that follows the 4-byte file signature.
// Its fields add up to cDDSSizeofDDSurfaceDesc2 (124) bytes.
struct DDSURFACEDESC2
{
    uint32_t dwSize;
    uint32_t dwFlags;
    uint32_t dwHeight;
    uint32_t dwWidth;
    union
    {
        int32_t lPitch;
        uint32_t dwLinearSize;
    };
    uint32_t dwBackBufferCount;
    uint32_t dwMipMapCount;
    uint32_t dwAlphaBitDepth;
    uint32_t dwUnused0;
    uint32_t lpSurface;
    DDCOLORKEY unused0;
    DDCOLORKEY unused1;
    DDCOLORKEY unused2;
    DDCOLORKEY unused3;
    DDPIXELFORMAT ddpfPixelFormat;
    DDSCAPS2 ddsCaps;
    uint32_t dwUnused1;
};

// DDSURFACEDESC2::dwFlags bits.
const uint32_t DDSD_CAPS = 0x00000001;
const uint32_t DDSD_HEIGHT = 0x00000002;
const uint32_t DDSD_WIDTH = 0x00000004;
const uint32_t DDSD_PITCH = 0x00000008;

const uint32_t DDSD_BACKBUFFERCOUNT = 0x00000020;
const uint32_t DDSD_ZBUFFERBITDEPTH = 0x00000040;
const uint32_t DDSD_ALPHABITDEPTH = 0x00000080;

const uint32_t DDSD_LPSURFACE = 0x00000800;

const uint32_t DDSD_PIXELFORMAT = 0x00001000;
const uint32_t DDSD_CKDESTOVERLAY = 0x00002000;
const uint32_t DDSD_CKDESTBLT = 0x00004000;
const uint32_t DDSD_CKSRCOVERLAY = 0x00008000;

const uint32_t DDSD_CKSRCBLT = 0x00010000;
const uint32_t DDSD_MIPMAPCOUNT = 0x00020000;
const uint32_t DDSD_REFRESHRATE = 0x00040000;
const uint32_t DDSD_LINEARSIZE = 0x00080000;

const uint32_t DDSD_TEXTURESTAGE = 0x00100000;
const uint32_t DDSD_FVF = 0x00200000;
const uint32_t DDSD_SRCVBHANDLE = 0x00400000;
const uint32_t DDSD_DEPTH = 0x00800000;

const uint32_t DDSD_ALL = 0x00fff9ee;

// DDPIXELFORMAT::dwFlags bits.
const uint32_t DDPF_ALPHAPIXELS = 0x00000001;
const uint32_t DDPF_ALPHA = 0x00000002;
const uint32_t DDPF_FOURCC = 0x00000004;
const uint32_t DDPF_PALETTEINDEXED8 = 0x00000020;
const uint32_t DDPF_RGB = 0x00000040;
const uint32_t DDPF_LUMINANCE = 0x00020000;

// DDSCAPS2::dwCaps bits.
const uint32_t DDSCAPS_COMPLEX = 0x00000008;
const uint32_t DDSCAPS_TEXTURE = 0x00001000;
const uint32_t DDSCAPS_MIPMAP = 0x00400000;

// DDSCAPS2::dwCaps2 bits.
const uint32_t DDSCAPS2_CUBEMAP = 0x00000200;
const uint32_t DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800;

const uint32_t DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000;
const uint32_t DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000;
const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000;

const uint32_t DDSCAPS2_VOLUME = 0x00200000;

// DXGI surface format codes, as stored in DDS_HEADER_DXT10::dxgiFormat.
typedef enum DXGI_FORMAT
{
    DXGI_FORMAT_UNKNOWN = 0,
    DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
    DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
    DXGI_FORMAT_R32G32B32A32_UINT = 3,
    DXGI_FORMAT_R32G32B32A32_SINT = 4,
    DXGI_FORMAT_R32G32B32_TYPELESS = 5,
    DXGI_FORMAT_R32G32B32_FLOAT = 6,
    DXGI_FORMAT_R32G32B32_UINT = 7,
    DXGI_FORMAT_R32G32B32_SINT = 8,
    DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
    DXGI_FORMAT_R16G16B16A16_FLOAT = 10,
    DXGI_FORMAT_R16G16B16A16_UNORM = 11,
    DXGI_FORMAT_R16G16B16A16_UINT = 12,
    DXGI_FORMAT_R16G16B16A16_SNORM = 13,
    DXGI_FORMAT_R16G16B16A16_SINT = 14,
    DXGI_FORMAT_R32G32_TYPELESS = 15,
    DXGI_FORMAT_R32G32_FLOAT = 16,
    DXGI_FORMAT_R32G32_UINT = 17,
    DXGI_FORMAT_R32G32_SINT = 18,
    DXGI_FORMAT_R32G8X24_TYPELESS = 19,
    DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
    DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
    DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
    DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
    DXGI_FORMAT_R10G10B10A2_UNORM = 24,
    DXGI_FORMAT_R10G10B10A2_UINT = 25,
    DXGI_FORMAT_R11G11B10_FLOAT = 26,
    DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
    DXGI_FORMAT_R8G8B8A8_UNORM = 28,
    DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
    DXGI_FORMAT_R8G8B8A8_UINT = 30,
    DXGI_FORMAT_R8G8B8A8_SNORM = 31,
    DXGI_FORMAT_R8G8B8A8_SINT = 32,
    DXGI_FORMAT_R16G16_TYPELESS = 33,
    DXGI_FORMAT_R16G16_FLOAT = 34,
    DXGI_FORMAT_R16G16_UNORM = 35,
    DXGI_FORMAT_R16G16_UINT = 36,
    DXGI_FORMAT_R16G16_SNORM = 37,
    DXGI_FORMAT_R16G16_SINT = 38,
    DXGI_FORMAT_R32_TYPELESS = 39,
    DXGI_FORMAT_D32_FLOAT = 40,
    DXGI_FORMAT_R32_FLOAT = 41,
    DXGI_FORMAT_R32_UINT = 42,
    DXGI_FORMAT_R32_SINT = 43,
    DXGI_FORMAT_R24G8_TYPELESS = 44,
    DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
    DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
    DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
    DXGI_FORMAT_R8G8_TYPELESS = 48,
    DXGI_FORMAT_R8G8_UNORM = 49,
    DXGI_FORMAT_R8G8_UINT = 50,
    DXGI_FORMAT_R8G8_SNORM = 51,
    DXGI_FORMAT_R8G8_SINT = 52,
    DXGI_FORMAT_R16_TYPELESS = 53,
    DXGI_FORMAT_R16_FLOAT = 54,
    DXGI_FORMAT_D16_UNORM = 55,
    DXGI_FORMAT_R16_UNORM = 56,
    DXGI_FORMAT_R16_UINT = 57,
    DXGI_FORMAT_R16_SNORM = 58,
    DXGI_FORMAT_R16_SINT = 59,
    DXGI_FORMAT_R8_TYPELESS = 60,
    DXGI_FORMAT_R8_UNORM = 61,
    DXGI_FORMAT_R8_UINT = 62,
    DXGI_FORMAT_R8_SNORM = 63,
    DXGI_FORMAT_R8_SINT = 64,
    DXGI_FORMAT_A8_UNORM = 65,
    DXGI_FORMAT_R1_UNORM = 66,
    DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
    DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
    DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
    DXGI_FORMAT_BC1_TYPELESS = 70,
    DXGI_FORMAT_BC1_UNORM = 71,
    DXGI_FORMAT_BC1_UNORM_SRGB = 72,
    DXGI_FORMAT_BC2_TYPELESS = 73,
    DXGI_FORMAT_BC2_UNORM = 74,
    DXGI_FORMAT_BC2_UNORM_SRGB = 75,
    DXGI_FORMAT_BC3_TYPELESS = 76,
    DXGI_FORMAT_BC3_UNORM = 77,
    DXGI_FORMAT_BC3_UNORM_SRGB = 78,
    DXGI_FORMAT_BC4_TYPELESS = 79,
    DXGI_FORMAT_BC4_UNORM = 80,
    DXGI_FORMAT_BC4_SNORM = 81,
    DXGI_FORMAT_BC5_TYPELESS = 82,
    DXGI_FORMAT_BC5_UNORM = 83,
    DXGI_FORMAT_BC5_SNORM = 84,
    DXGI_FORMAT_B5G6R5_UNORM = 85,
    DXGI_FORMAT_B5G5R5A1_UNORM = 86,
    DXGI_FORMAT_B8G8R8A8_UNORM = 87,
    DXGI_FORMAT_B8G8R8X8_UNORM = 88,
    DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
    DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
    DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
    DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
    DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
    DXGI_FORMAT_BC6H_TYPELESS = 94,
    DXGI_FORMAT_BC6H_UF16 = 95,
    DXGI_FORMAT_BC6H_SF16 = 96,
    DXGI_FORMAT_BC7_TYPELESS = 97,
    DXGI_FORMAT_BC7_UNORM = 98,
    DXGI_FORMAT_BC7_UNORM_SRGB = 99,
    DXGI_FORMAT_AYUV = 100,
    DXGI_FORMAT_Y410 = 101,
    DXGI_FORMAT_Y416 = 102,
    DXGI_FORMAT_NV12 = 103,
    DXGI_FORMAT_P010 = 104,
    DXGI_FORMAT_P016 = 105,
    DXGI_FORMAT_420_OPAQUE = 106,
    DXGI_FORMAT_YUY2 = 107,
    DXGI_FORMAT_Y210 = 108,
    DXGI_FORMAT_Y216 = 109,
    DXGI_FORMAT_NV11 = 110,
    DXGI_FORMAT_AI44 = 111,
    DXGI_FORMAT_IA44 = 112,
    DXGI_FORMAT_P8 = 113,
    DXGI_FORMAT_A8P8 = 114,
    DXGI_FORMAT_B4G4R4A4_UNORM = 115,
    DXGI_FORMAT_P208 = 130,
    DXGI_FORMAT_V208 = 131,
    DXGI_FORMAT_V408 = 132,
    DXGI_FORMAT_FORCE_UINT = 0xffffffff
} DXGI_FORMAT;

enum D3D10_RESOURCE_DIMENSION
{
    D3D10_RESOURCE_DIMENSION_UNKNOWN = 0,
    D3D10_RESOURCE_DIMENSION_BUFFER = 1,
    D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2,
    D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3,
    D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4
};

// Extended header carrying the DXGI format and resource dimension.
struct DDS_HEADER_DXT10
{
    DXGI_FORMAT dxgiFormat;
    D3D10_RESOURCE_DIMENSION resourceDimension;
    uint32_t miscFlag;
    uint32_t arraySize;
    uint32_t miscFlags2;
};

--------------------------------------------------------------------------------
/ktx_defs.h:
--------------------------------------------------------------------------------
// File: ktx_defs.h
// .KTX file header definitions.
// -> https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/
#pragma once

// uint8_t / uint32_t are used below; include them here so the header does not depend on
// whatever the including file happened to include first.
#include <stdint.h>


// OpenGL constants
#define GL_RGB 0x1907
#define GL_RGBA 0x1908
#define GL_COMPRESSED_RGBA_BPTC_UNORM 0x8E8C
#define GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM 0x8E8D


constexpr char const * ktxFileNameExt = ".ktx";

// 12-byte file identifier: 0xAB "KTX 11" 0xBB "\r\n" 0x1A "\n".
static constexpr uint8_t ktxFileIdentifier[12] =
{
    0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A
};

// Value for KTX_HEADER::endianness.
static constexpr uint32_t ktxEndianess = 0x04030201;

// Key/value entry as raw bytes: NUL-terminated key "KTXorientation" followed by
// NUL-terminated value "S=r,T=d".
static constexpr uint8_t ktxOrientation[] =
{
    'K','T','X','o','r','i','e','n','t','a','t','i','o','n',0,
    'S','=','r',',','T','=','d',0
};


// Fixed-size KTX file header. The trailing comments give each field's byte offset.
struct KTX_HEADER
{
    uint8_t identifier[12]; // 0x00
    uint32_t endianness; // 0x0c
    uint32_t glType; // 0x10
    uint32_t glTypeSize; // 0x14
    uint32_t glFormat; // 0x18
    uint32_t glInternalFormat; // 0x1c
    uint32_t glBaseInternalFormat; // 0x20
    uint32_t pixelWidth; // 0x24
    uint32_t pixelHeight; // 0x28
    uint32_t pixelDepth; // 0x2c
    uint32_t numberOfArrayElements; // 0x30
    uint32_t numberOfFaces; // 0x34
    uint32_t numberOfMipmapLevels; // 0x38
    uint32_t bytesOfKeyValueData; // 0x3c
};

// 12 identifier bytes + 13 32-bit fields; catches accidental padding or field edits.
static_assert(sizeof(KTX_HEADER) == 64, "KTX_HEADER must be exactly 64 bytes");

--------------------------------------------------------------------------------