├── .gitignore ├── tests ├── images │ ├── brick.png │ ├── blender.png │ ├── marble.png │ ├── brick-alpha.png │ └── marble-alpha.png ├── multi_tasks.rs ├── common │ └── mod.rs └── metrics.rs ├── rustfmt.toml ├── compressor ├── Cargo.toml └── src │ └── main.rs ├── README.md ├── LICENSE ├── Cargo.toml ├── CHANGELOG.md └── src ├── lib.rs ├── settings.rs ├── encode ├── bc1_to_5.rs ├── common.rs └── bc7.rs ├── encode.rs ├── decode.rs ├── shader └── bc1_to_5.wgsl └── block_compressor.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /target 3 | Cargo.lock 4 | *.png 5 | *.dds 6 | -------------------------------------------------------------------------------- /tests/images/brick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hasenbanck/block_compression/HEAD/tests/images/brick.png -------------------------------------------------------------------------------- /tests/images/blender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hasenbanck/block_compression/HEAD/tests/images/blender.png -------------------------------------------------------------------------------- /tests/images/marble.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hasenbanck/block_compression/HEAD/tests/images/marble.png -------------------------------------------------------------------------------- /tests/images/brick-alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hasenbanck/block_compression/HEAD/tests/images/brick-alpha.png -------------------------------------------------------------------------------- /tests/images/marble-alpha.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hasenbanck/block_compression/HEAD/tests/images/marble-alpha.png -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | # These don't change the "standard" how Rust programs normaly look, but make things more consistent. 2 | format_code_in_doc_comments = true 3 | hex_literal_case = "Upper" 4 | imports_granularity = "Crate" 5 | group_imports = "StdExternalCrate" 6 | use_try_shorthand = true 7 | -------------------------------------------------------------------------------- /compressor/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "compressor" 3 | version = "0.1.0" 4 | publish = false 5 | authors.workspace = true 6 | edition.workspace = true 7 | rust-version.workspace = true 8 | 9 | [dependencies] 10 | block_compression = { path = ".." } 11 | bytemuck = { workspace = true } 12 | ddsfile = { workspace = true } 13 | image = { workspace = true, features = ["bmp", "png", "tga"] } 14 | pollster = { workspace = true } 15 | wgpu = { workspace = true, features = ["static-dxc"] } 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # block_compression 2 | 3 | [![Crate](https://img.shields.io/crates/v/block_compression.svg)](https://crates.io/crates/block_compression) 4 | [![API](https://docs.rs/block_compression/badge.svg)](https://docs.rs/block_compression) 5 | 6 | Texture block compression using WGPU compute shader. 7 | The shaders are a port of Intel's ISPC Texture Compressor's kernel to WGSL compute shader. 
8 | 9 | Tested with the following backends: 10 | 11 | * DX12 12 | * Metal 13 | * Vulkan 14 | 15 | ## Supported block compressions 16 | 17 | Currently supported block compressions are: 18 | 19 | * BC1 20 | * BC2 21 | * BC3 22 | * BC4 23 | * BC5 24 | * BC6H 25 | * BC7 26 | 27 | ## DX12 pipeline creation 28 | 29 | The pipeline creation for BC7 and especially BC6H takes a long time under DX12. The DXC compiler seems to take a very 30 | long time to compile the shader. For this reason we moved them behind features, which are included in the default 31 | features. 32 | 33 | ## License 34 | 35 | This project is licensed under the [MIT](LICENSE) license. 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2025, Nils Hasenbanck 2 | Copyright (c) 2016-2024, Intel Corporation 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 15 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 16 | SOFTWARE. 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace.package] 2 | authors = ["Nils Hasenbanck "] 3 | edition = "2021" 4 | rust-version = "1.80" 5 | 6 | [package] 7 | name = "block_compression" 8 | description = "Texture block compression using WGPU compute shader" 9 | version = "0.7.0" 10 | license = "MIT" 11 | documentation = "https://docs.rs/block_compression" 12 | repository = "https://github.com/hasenbanck/block_compression" 13 | authors.workspace = true 14 | edition.workspace = true 15 | rust-version.workspace = true 16 | keywords = ["texture", "image", "compress", "wgpu"] 17 | categories = ["rendering", "rendering::engine"] 18 | exclude = ["tests/images/"] 19 | 20 | [badges] 21 | maintenance = { status = "actively-developed" } 22 | 23 | [features] 24 | default = ["bc15", "bc6h", "bc7", "wgpu"] 25 | bc15 = [] 26 | bc6h = ["half"] 27 | bc7 = [] 28 | 29 | [dependencies] 30 | bytemuck = { workspace = true, features = ["derive"] } 31 | half = { workspace = true, optional = true, features = ["bytemuck"] } 32 | wgpu = { workspace = true, optional = true } 33 | 34 | [dev-dependencies] 35 | image = { workspace = true, features = ["png"] } 36 | pollster = { workspace = true } 37 | wgpu = { workspace = true, features = ["static-dxc"] } 38 | 39 | [workspace.dependencies] 40 | bytemuck = "1" 41 | ddsfile = "0.5" 42 | half = "2" 43 | image = { version = "0.25", default-features = false } 44 | pollster = "0.4" 45 | wgpu = "27" 46 | 47 | [package.metadata.docs.rs] 48 | features = ["bc6h", "bc7", "half"] 49 | rustdoc-args = ["--cfg", "docsrs"] 50 | 51 | [workspace] 52 
| members = [ 53 | "compressor", 54 | ] 55 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.7.0] - 2025-10-02 9 | 10 | ### Updated 11 | 12 | - Target WGPU 27 13 | 14 | ## [0.6.0] - 2025-07-23 15 | 16 | ### Updated 17 | 18 | - Target WGPU 26 19 | 20 | ## [0.5.0] - 2025-06-02 21 | 22 | ### Changed 23 | 24 | - `GpuBlockCompressor::new()` takes the WGPU device and queue directly without an Arc wrapped around it. WGPU 25 25 | made the main structures clonable, since they are internally reference counted, so it's not needed anymore to wrap 26 | them in a smart pointer anymore. 27 | 28 | ### Fixed 29 | 30 | - Fix an issue with AMD integrated GPU's where WGPU's forced loop bounding in shaders made running the BC7 shader 31 | impossible. 32 | 33 | ## [0.4.0] - 2025-04-11 34 | 35 | ### Updated 36 | 37 | - Target WGPU 25 38 | 39 | ## [0.3.0] - 2025-02-21 40 | 41 | ### Updated 42 | 43 | - Allow the GPU compressor to use row based offsets into the texture to 44 | allow submitting smaller chunks of work. 45 | 46 | ## [0.2.1] - 2025-02-17 47 | 48 | ### Updated 49 | 50 | - Fix BC6H encoding for black pixels 51 | - Use adapter limits in the example compressor 52 | - Improve PSNR output CPU of when compared to the GPU versions of BC6H / BC7 53 | 54 | ## [0.2.0] - 2025-01-22 55 | 56 | ### Added 57 | 58 | - Provide more feature flags for optional features 59 | - Implemented CPU based BC6H encoding 60 | - Implemented CPU based BC7 encoding 61 | 62 | ## [0.1.1] - 2025-01-20 63 | 64 | ### Updated 65 | 66 | - Fix compilation with no default features. 
67 | 68 | ## [0.1.0] - 2025-01-20 69 | 70 | ### Added 71 | 72 | - Initial release. 73 | -------------------------------------------------------------------------------- /tests/multi_tasks.rs: -------------------------------------------------------------------------------- 1 | use block_compression::*; 2 | use wgpu::{CommandEncoderDescriptor, ComputePassDescriptor, TextureViewDescriptor}; 3 | 4 | use crate::common::{ 5 | create_blocks_buffer, create_wgpu_resources, download_blocks_data, 6 | read_image_and_create_texture, BRICK_FILE_PATH, MARBLE_FILE_PATH, 7 | }; 8 | 9 | mod common; 10 | 11 | fn test_multi_task_compression(variant: CompressionVariant) { 12 | let (device, queue) = create_wgpu_resources(); 13 | let mut block_compressor = GpuBlockCompressor::new(device.clone(), queue.clone()); 14 | 15 | let (brick_texture, _) = 16 | read_image_and_create_texture(&device, &queue, BRICK_FILE_PATH, variant); 17 | let (marble_texture, _) = 18 | read_image_and_create_texture(&device, &queue, MARBLE_FILE_PATH, variant); 19 | 20 | let brick_height = brick_texture.height(); 21 | let marble_height = marble_texture.height(); 22 | 23 | // Split heights in half (rounded to multiple of 4) 24 | let brick_half_height = (brick_height / 2) & !3; 25 | let marble_half_height = (marble_height / 2) & !3; 26 | 27 | let bricks_half_size = variant.blocks_byte_size(brick_texture.width(), brick_half_height); 28 | let marble_half_size = variant.blocks_byte_size(marble_texture.width(), marble_half_height); 29 | let total_size = (bricks_half_size * 2) + (marble_half_size * 2); 30 | 31 | let blocks = create_blocks_buffer(&device, total_size as u64); 32 | 33 | block_compressor.add_compression_task( 34 | variant, 35 | &brick_texture.create_view(&TextureViewDescriptor::default()), 36 | brick_texture.width(), 37 | brick_half_height, 38 | &blocks, 39 | None, 40 | None, 41 | ); 42 | block_compressor.add_compression_task( 43 | variant, 44 | &brick_texture.create_view(&TextureViewDescriptor::default()), 45 
| brick_texture.width(), 46 | brick_half_height, 47 | &blocks, 48 | Some(brick_half_height), 49 | Some(bricks_half_size as u32), 50 | ); 51 | 52 | block_compressor.add_compression_task( 53 | variant, 54 | &marble_texture.create_view(&TextureViewDescriptor::default()), 55 | marble_texture.width(), 56 | marble_half_height, 57 | &blocks, 58 | None, 59 | Some((bricks_half_size * 2) as u32), 60 | ); 61 | block_compressor.add_compression_task( 62 | variant, 63 | &marble_texture.create_view(&TextureViewDescriptor::default()), 64 | marble_texture.width(), 65 | marble_half_height, 66 | &blocks, 67 | Some(marble_half_height), 68 | Some((bricks_half_size * 2 + marble_half_size) as u32), 69 | ); 70 | 71 | let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor { 72 | label: Some("command encoder"), 73 | }); 74 | 75 | { 76 | let mut pass = encoder.begin_compute_pass(&ComputePassDescriptor { 77 | label: Some("compute pass"), 78 | timestamp_writes: None, 79 | }); 80 | 81 | block_compressor.compress(&mut pass); 82 | } 83 | 84 | queue.submit([encoder.finish()]); 85 | 86 | let blocks_data = download_blocks_data(&device, &queue, blocks); 87 | 88 | let brick_first_half_not_empty = !blocks_data[..bricks_half_size] 89 | .iter() 90 | .all(|&data| data == 0); 91 | let brick_second_half_not_empty = !blocks_data[bricks_half_size..bricks_half_size * 2] 92 | .iter() 93 | .all(|&data| data == 0); 94 | let marble_first_half_not_empty = !blocks_data 95 | [bricks_half_size * 2..bricks_half_size * 2 + marble_half_size] 96 | .iter() 97 | .all(|&data| data == 0); 98 | let marble_second_half_not_empty = !blocks_data[bricks_half_size * 2 + marble_half_size..] 
99 | .iter() 100 | .all(|&data| data == 0); 101 | 102 | assert!(brick_first_half_not_empty, "Brick first half is empty"); 103 | assert!(brick_second_half_not_empty, "Brick second half is empty"); 104 | assert!(marble_first_half_not_empty, "Marble first half is empty"); 105 | assert!(marble_second_half_not_empty, "Marble second half is empty"); 106 | } 107 | 108 | #[test] 109 | fn multi_task_compression_bc1() { 110 | test_multi_task_compression(CompressionVariant::BC1); 111 | } 112 | 113 | #[test] 114 | fn multi_task_compression_bc2() { 115 | test_multi_task_compression(CompressionVariant::BC2); 116 | } 117 | 118 | #[test] 119 | fn multi_task_compression_bc3() { 120 | test_multi_task_compression(CompressionVariant::BC3); 121 | } 122 | 123 | #[test] 124 | fn multi_task_compression_bc4() { 125 | test_multi_task_compression(CompressionVariant::BC4); 126 | } 127 | 128 | #[test] 129 | fn multi_task_compression_bc5() { 130 | test_multi_task_compression(CompressionVariant::BC5); 131 | } 132 | 133 | #[test] 134 | fn multi_task_compression_bc6h() { 135 | test_multi_task_compression(CompressionVariant::BC6H(BC6HSettings::very_fast())); 136 | } 137 | 138 | #[test] 139 | fn multi_task_compression_bc7() { 140 | test_multi_task_compression(CompressionVariant::BC7(BC7Settings::opaque_ultra_fast())); 141 | } 142 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # block_compression 2 | //! 3 | //! Texture block compression using WGPU compute shader. 4 | //! The shaders are a port of Intel's ISPC Texture Compressor's kernel to WGSL compute shader. 5 | //! 6 | //! Tested with the following backends: 7 | //! 8 | //! * DX12 9 | //! * Metal 10 | //! * Vulkan 11 | //! 12 | //! ## DX12 pipeline creation 13 | //! 14 | //! The pipeline creation for BC7 and especially BC6H takes a long time under DX12. The DXC compiler 15 | //! 
seems to take a very long time to compile the shader. For this reason we moved them behind 16 | //! features, which are included in the default features. 17 | //! 18 | //! ## Supported block compressions 19 | //! 20 | //! Currently supported block compressions are: 21 | //! 22 | //! * BC1 23 | //! * BC2 24 | //! * BC3 25 | //! * BC4 26 | //! * BC5 27 | //! * BC6H 28 | //! * BC7 29 | 30 | #![cfg_attr(docsrs, feature(doc_cfg))] 31 | 32 | #[cfg(all( 33 | feature = "wgpu", 34 | any(feature = "bc15", feature = "bc6h", feature = "bc7") 35 | ))] 36 | mod block_compressor; 37 | pub mod decode; 38 | pub mod encode; 39 | mod settings; 40 | 41 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 42 | use std::hash::{Hash, Hasher}; 43 | 44 | #[cfg(all( 45 | feature = "wgpu", 46 | any(feature = "bc15", feature = "bc6h", feature = "bc7") 47 | ))] 48 | #[cfg_attr( 49 | docsrs, 50 | doc(cfg(all( 51 | feature = "wgpu", 52 | any(feature = "bc15", feature = "bc6h", feature = "bc7") 53 | ))) 54 | )] 55 | pub use block_compressor::GpuBlockCompressor; 56 | pub use bytemuck; 57 | #[cfg(feature = "bc6h")] 58 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 59 | pub use half; 60 | #[cfg(feature = "bc6h")] 61 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 62 | pub use settings::BC6HSettings; 63 | #[cfg(feature = "bc7")] 64 | #[cfg_attr(docsrs, doc(cfg(feature = "bc7")))] 65 | pub use settings::BC7Settings; 66 | 67 | /// Block compression variants supported by this crate. 
68 | #[derive(Copy, Clone, Debug)] 69 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 70 | #[cfg_attr( 71 | docsrs, 72 | doc(cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))) 73 | )] 74 | pub enum CompressionVariant { 75 | #[cfg(feature = "bc15")] 76 | #[cfg_attr(docsrs, doc(cfg(feature = "bc15")))] 77 | /// BC1 compression (RGB) 78 | BC1, 79 | #[cfg(feature = "bc15")] 80 | #[cfg_attr(docsrs, doc(cfg(feature = "bc15")))] 81 | /// BC2 compression with sharp alpha (RGBA) 82 | BC2, 83 | #[cfg(feature = "bc15")] 84 | #[cfg_attr(docsrs, doc(cfg(feature = "bc15")))] 85 | /// BC3 compression with smooth alpha (RGBA) 86 | BC3, 87 | #[cfg(feature = "bc15")] 88 | #[cfg_attr(docsrs, doc(cfg(feature = "bc15")))] 89 | /// BC4 compression (R) 90 | BC4, 91 | #[cfg(feature = "bc15")] 92 | #[cfg_attr(docsrs, doc(cfg(feature = "bc15")))] 93 | /// BC5 compression (RG) 94 | BC5, 95 | #[cfg(feature = "bc6h")] 96 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 97 | /// BC6H compression (RGB) 98 | BC6H(BC6HSettings), 99 | #[cfg(feature = "bc7")] 100 | #[cfg_attr(docsrs, doc(cfg(feature = "bc7")))] 101 | /// BC7 compression with smooth alpha (RGBA) 102 | BC7(BC7Settings), 103 | } 104 | 105 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 106 | impl PartialEq for CompressionVariant { 107 | fn eq(&self, other: &Self) -> bool { 108 | std::mem::discriminant(self) == std::mem::discriminant(other) 109 | } 110 | } 111 | 112 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 113 | impl Eq for CompressionVariant {} 114 | 115 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 116 | impl Hash for CompressionVariant { 117 | fn hash(&self, state: &mut H) { 118 | std::mem::discriminant(self).hash(state); 119 | } 120 | } 121 | 122 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 123 | impl CompressionVariant { 124 | /// Returns the bytes per row for the given width. 
125 | /// 126 | /// The width is used to calculate how many blocks are needed per row, 127 | /// which is then multiplied by the block size. 128 | /// Width is rounded up to the nearest multiple of 4. 129 | pub const fn bytes_per_row(self, width: u32) -> u32 { 130 | let blocks_per_row = width.div_ceil(4); 131 | blocks_per_row * self.block_byte_size() 132 | } 133 | 134 | /// Returns the byte size required for storing compressed blocks for the given dimensions. 135 | /// 136 | /// The size is calculated based on the block compression format and rounded up dimensions. 137 | /// Width and height are rounded up to the nearest multiple of 4. 138 | pub const fn blocks_byte_size(self, width: u32, height: u32) -> usize { 139 | let block_width = (width as usize).div_ceil(4); 140 | let block_height = (height as usize).div_ceil(4); 141 | let block_count = block_width * block_height; 142 | let block_size = self.block_byte_size() as usize; 143 | block_count * block_size 144 | } 145 | 146 | const fn block_byte_size(self) -> u32 { 147 | match self { 148 | #[cfg(feature = "bc15")] 149 | Self::BC1 | Self::BC4 => 8, 150 | #[cfg(feature = "bc15")] 151 | Self::BC2 | Self::BC3 | Self::BC5 => 16, 152 | #[cfg(feature = "bc6h")] 153 | Self::BC6H(..) => 16, 154 | #[cfg(feature = "bc7")] 155 | Self::BC7(..) => 16, 156 | } 157 | } 158 | 159 | #[cfg(feature = "wgpu")] 160 | const fn name(self) -> &'static str { 161 | match self { 162 | #[cfg(feature = "bc15")] 163 | Self::BC1 => "bc1", 164 | #[cfg(feature = "bc15")] 165 | Self::BC2 => "bc2", 166 | #[cfg(feature = "bc15")] 167 | Self::BC3 => "bc3", 168 | #[cfg(feature = "bc15")] 169 | Self::BC4 => "bc4", 170 | #[cfg(feature = "bc15")] 171 | Self::BC5 => "bc5", 172 | #[cfg(feature = "bc6h")] 173 | Self::BC6H(..) => "bc6h", 174 | #[cfg(feature = "bc7")] 175 | Self::BC7(..) 
=> "bc7", 176 | } 177 | } 178 | 179 | #[cfg(feature = "wgpu")] 180 | const fn entry_point(self) -> &'static str { 181 | match self { 182 | #[cfg(feature = "bc15")] 183 | Self::BC1 => "compress_bc1", 184 | #[cfg(feature = "bc15")] 185 | Self::BC2 => "compress_bc2", 186 | #[cfg(feature = "bc15")] 187 | Self::BC3 => "compress_bc3", 188 | #[cfg(feature = "bc15")] 189 | Self::BC4 => "compress_bc4", 190 | #[cfg(feature = "bc15")] 191 | Self::BC5 => "compress_bc5", 192 | #[cfg(feature = "bc6h")] 193 | Self::BC6H(..) => "compress_bc6h", 194 | #[cfg(feature = "bc7")] 195 | Self::BC7(..) => "compress_bc7", 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | sync::{Arc, LazyLock}, 3 | time::Duration, 4 | }; 5 | 6 | use block_compression::CompressionVariant; 7 | use half::f16; 8 | use image::ImageReader; 9 | use pollster::block_on; 10 | use wgpu::{ 11 | util::{DeviceExt, TextureDataOrder}, 12 | wgt::{Dx12SwapchainKind, Dx12UseFrameLatencyWaitableObject}, 13 | BackendOptions, Backends, Buffer, BufferDescriptor, BufferUsages, CommandEncoderDescriptor, 14 | Device, DeviceDescriptor, Dx12BackendOptions, Dx12Compiler, Error, ExperimentalFeatures, 15 | Extent3d, Features, Instance, InstanceDescriptor, InstanceFlags, Limits, MapMode, MemoryHints, 16 | PollType, PowerPreference, Queue, Texture, TextureDescriptor, TextureDimension, TextureFormat, 17 | TextureUsages, Trace, 18 | }; 19 | 20 | #[inline] 21 | pub fn srgb_to_linear(srgb: u8) -> f64 { 22 | let v = (srgb as f64) / 255.0; 23 | if v <= 0.04045 { 24 | v / 12.92 25 | } else { 26 | ((v + 0.055) / 1.055).powf(2.4) 27 | } 28 | } 29 | 30 | pub const BRICK_FILE_PATH: &str = "tests/images/brick.png"; 31 | pub const MARBLE_FILE_PATH: &str = "tests/images/marble.png"; 32 | 33 | pub fn create_wgpu_resources() -> (Device, Queue) { 34 | static CACHE: 
LazyLock<(Device, Queue)> = LazyLock::new(|| { 35 | let instance = Instance::new(&InstanceDescriptor { 36 | backends: Backends::from_env().unwrap_or_default(), 37 | flags: InstanceFlags::from_build_config().with_env(), 38 | memory_budget_thresholds: Default::default(), 39 | backend_options: BackendOptions { 40 | dx12: Dx12BackendOptions { 41 | shader_compiler: Dx12Compiler::StaticDxc, 42 | presentation_system: Dx12SwapchainKind::DxgiFromHwnd, 43 | latency_waitable_object: Dx12UseFrameLatencyWaitableObject::Wait, 44 | } 45 | .with_env(), 46 | ..Default::default() 47 | }, 48 | }); 49 | 50 | let adapter = block_on(instance.request_adapter(&wgpu::RequestAdapterOptions { 51 | power_preference: PowerPreference::HighPerformance, 52 | compatible_surface: None, 53 | force_fallback_adapter: false, 54 | })) 55 | .expect("Failed to find an appropriate adapter"); 56 | 57 | let (device, queue) = block_on(adapter.request_device(&DeviceDescriptor { 58 | label: Some("main device"), 59 | required_features: Features::default(), 60 | required_limits: Limits::default(), 61 | experimental_features: ExperimentalFeatures::disabled(), 62 | memory_hints: MemoryHints::Performance, 63 | trace: Trace::Off, 64 | })) 65 | .expect("Failed to create device"); 66 | device.on_uncaptured_error(Arc::new(error_handler)); 67 | 68 | (device, queue) 69 | }); 70 | 71 | CACHE.clone() 72 | } 73 | 74 | pub fn error_handler(error: Error) { 75 | let (message_type, message) = match error { 76 | Error::OutOfMemory { source } => ("OutOfMemory", source.to_string()), 77 | Error::Validation { 78 | source, 79 | description, 80 | } => ("Validation", format!("{source}: {description}")), 81 | Error::Internal { 82 | source, 83 | description, 84 | } => ("Internal", format!("{source}: {description}")), 85 | }; 86 | 87 | panic!("wgpu [{message_type}] [error]: {message}"); 88 | } 89 | 90 | pub fn read_image_and_create_texture( 91 | device: &Device, 92 | queue: &Queue, 93 | file_path: &str, 94 | variant: CompressionVariant, 95 
| ) -> (Texture, Vec) { 96 | let image = ImageReader::open(file_path) 97 | .expect("can't open input image") 98 | .decode() 99 | .expect("can't decode image"); 100 | 101 | let rgba_image = image.to_rgba8(); 102 | let width = rgba_image.width(); 103 | let height = rgba_image.height(); 104 | 105 | let texture = if matches!(variant, CompressionVariant::BC6H(..)) { 106 | let rgba_f16_data: Vec = rgba_image 107 | .iter() 108 | .flat_map(|color| f16::from_f64(srgb_to_linear(*color)).to_ne_bytes()) 109 | .collect(); 110 | 111 | device.create_texture_with_data( 112 | queue, 113 | &TextureDescriptor { 114 | label: Some(file_path), 115 | size: Extent3d { 116 | width, 117 | height, 118 | depth_or_array_layers: 1, 119 | }, 120 | mip_level_count: 1, 121 | sample_count: 1, 122 | dimension: TextureDimension::D2, 123 | format: TextureFormat::Rgba16Float, 124 | usage: TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING, 125 | view_formats: &[], 126 | }, 127 | TextureDataOrder::LayerMajor, 128 | rgba_f16_data.as_slice(), 129 | ) 130 | } else { 131 | device.create_texture_with_data( 132 | queue, 133 | &TextureDescriptor { 134 | label: Some(file_path), 135 | size: Extent3d { 136 | width, 137 | height, 138 | depth_or_array_layers: 1, 139 | }, 140 | mip_level_count: 1, 141 | sample_count: 1, 142 | dimension: TextureDimension::D2, 143 | format: TextureFormat::Rgba8Unorm, 144 | usage: TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING, 145 | view_formats: &[], 146 | }, 147 | TextureDataOrder::LayerMajor, 148 | &rgba_image, 149 | ) 150 | }; 151 | 152 | (texture, rgba_image.to_vec()) 153 | } 154 | 155 | pub fn create_blocks_buffer(device: &Device, size: u64) -> Buffer { 156 | device.create_buffer(&BufferDescriptor { 157 | label: Some("blocks buffer"), 158 | size, 159 | usage: BufferUsages::COPY_SRC | BufferUsages::STORAGE, 160 | mapped_at_creation: false, 161 | }) 162 | } 163 | 164 | pub fn download_blocks_data(device: &Device, queue: &Queue, block_buffer: Buffer) -> Vec { 165 | 
let size = block_buffer.size(); 166 | 167 | let staging_buffer = device.create_buffer(&BufferDescriptor { 168 | label: Some("staging buffer"), 169 | size, 170 | usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, 171 | mapped_at_creation: false, 172 | }); 173 | 174 | let mut copy_encoder = device.create_command_encoder(&CommandEncoderDescriptor { 175 | label: Some("copy encoder"), 176 | }); 177 | 178 | copy_encoder.copy_buffer_to_buffer(&block_buffer, 0, &staging_buffer, 0, size); 179 | 180 | queue.submit([copy_encoder.finish()]); 181 | 182 | let result; 183 | 184 | { 185 | let buffer_slice = staging_buffer.slice(..); 186 | 187 | let (tx, rx) = std::sync::mpsc::channel(); 188 | buffer_slice.map_async(MapMode::Read, move |v| tx.send(v).unwrap()); 189 | 190 | let _ = device.poll(PollType::Wait { 191 | submission_index: None, 192 | timeout: Some(Duration::from_secs(60)), 193 | }); 194 | 195 | match rx.recv() { 196 | Ok(Ok(())) => { 197 | result = buffer_slice.get_mapped_range().to_vec(); 198 | } 199 | _ => panic!("couldn't read from buffer"), 200 | } 201 | } 202 | 203 | staging_buffer.unmap(); 204 | 205 | result 206 | } 207 | -------------------------------------------------------------------------------- /src/settings.rs: -------------------------------------------------------------------------------- 1 | #[cfg(any(feature = "bc6h", feature = "bc7"))] 2 | use bytemuck::{Pod, Zeroable}; 3 | 4 | /// Encoding settings for BC6H. 5 | #[cfg(feature = "bc6h")] 6 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 7 | #[derive(Copy, Clone, Debug, Eq, PartialEq, Pod, Zeroable)] 8 | #[repr(C)] 9 | pub struct BC6HSettings { 10 | pub(crate) slow_mode: u32, 11 | pub(crate) fast_mode: u32, 12 | pub(crate) refine_iterations_1p: u32, 13 | pub(crate) refine_iterations_2p: u32, 14 | pub(crate) fast_skip_threshold: u32, 15 | } 16 | 17 | #[cfg(feature = "bc6h")] 18 | impl BC6HSettings { 19 | /// Very fast settings. 
20 | pub const fn very_fast() -> Self { 21 | Self { 22 | slow_mode: false as _, 23 | fast_mode: true as _, 24 | fast_skip_threshold: 0, 25 | refine_iterations_1p: 0, 26 | refine_iterations_2p: 0, 27 | } 28 | } 29 | 30 | /// Fast settings. 31 | pub const fn fast() -> Self { 32 | Self { 33 | slow_mode: false as _, 34 | fast_mode: true as _, 35 | fast_skip_threshold: 2, 36 | refine_iterations_1p: 0, 37 | refine_iterations_2p: 1, 38 | } 39 | } 40 | 41 | /// Basic settings. 42 | pub const fn basic() -> Self { 43 | Self { 44 | slow_mode: false as _, 45 | fast_mode: false as _, 46 | fast_skip_threshold: 4, 47 | refine_iterations_1p: 2, 48 | refine_iterations_2p: 2, 49 | } 50 | } 51 | 52 | /// Slow settings. 53 | pub const fn slow() -> Self { 54 | Self { 55 | slow_mode: true as _, 56 | fast_mode: false as _, 57 | fast_skip_threshold: 10, 58 | refine_iterations_1p: 2, 59 | refine_iterations_2p: 2, 60 | } 61 | } 62 | 63 | /// Very slow settings. 64 | pub const fn very_slow() -> Self { 65 | Self { 66 | slow_mode: true as _, 67 | fast_mode: false as _, 68 | fast_skip_threshold: 32, 69 | refine_iterations_1p: 2, 70 | refine_iterations_2p: 2, 71 | } 72 | } 73 | } 74 | 75 | #[cfg(feature = "bc7")] 76 | #[cfg_attr(docsrs, doc(cfg(feature = "bc7")))] 77 | /// Encoding settings for BC7. 78 | #[derive(Copy, Clone, Debug, Eq, PartialEq, Pod, Zeroable)] 79 | #[repr(C)] 80 | pub struct BC7Settings { 81 | pub(crate) refine_iterations: [u32; 8], 82 | pub(crate) mode_selection: [u32; 4], 83 | pub(crate) skip_mode2: u32, 84 | pub(crate) fast_skip_threshold_mode1: u32, 85 | pub(crate) fast_skip_threshold_mode3: u32, 86 | pub(crate) fast_skip_threshold_mode7: u32, 87 | pub(crate) mode45_channel0: u32, 88 | pub(crate) refine_iterations_channel: u32, 89 | pub(crate) channels: u32, 90 | } 91 | 92 | #[cfg(feature = "bc7")] 93 | #[cfg_attr(docsrs, doc(cfg(feature = "bc7")))] 94 | impl BC7Settings { 95 | /// Opaque ultra fast settings. 
96 | pub const fn opaque_ultra_fast() -> Self { 97 | Self { 98 | channels: 3, 99 | mode_selection: [false as _, false as _, false as _, true as _], 100 | skip_mode2: true as _, 101 | fast_skip_threshold_mode1: 3, 102 | fast_skip_threshold_mode3: 1, 103 | fast_skip_threshold_mode7: 0, 104 | mode45_channel0: 0, 105 | refine_iterations_channel: 0, 106 | refine_iterations: [2, 2, 2, 1, 2, 2, 1, 0], 107 | } 108 | } 109 | 110 | /// Opaque very fast settings. 111 | pub const fn opaque_very_fast() -> Self { 112 | Self { 113 | channels: 3, 114 | mode_selection: [false as _, true as _, false as _, true as _], 115 | skip_mode2: true as _, 116 | fast_skip_threshold_mode1: 3, 117 | fast_skip_threshold_mode3: 1, 118 | fast_skip_threshold_mode7: 0, 119 | mode45_channel0: 0, 120 | refine_iterations_channel: 0, 121 | refine_iterations: [2, 2, 2, 1, 2, 2, 1, 0], 122 | } 123 | } 124 | 125 | /// Opaque fast settings. 126 | pub const fn opaque_fast() -> Self { 127 | Self { 128 | channels: 3, 129 | mode_selection: [false as _, true as _, false as _, true as _], 130 | skip_mode2: true as _, 131 | fast_skip_threshold_mode1: 12, 132 | fast_skip_threshold_mode3: 4, 133 | fast_skip_threshold_mode7: 0, 134 | mode45_channel0: 0, 135 | refine_iterations_channel: 0, 136 | refine_iterations: [2, 2, 2, 1, 2, 2, 2, 0], 137 | } 138 | } 139 | 140 | /// Opaque basic settings. 141 | pub const fn opaque_basic() -> Self { 142 | Self { 143 | channels: 3, 144 | mode_selection: [true as _, true as _, true as _, true as _], 145 | skip_mode2: true as _, 146 | fast_skip_threshold_mode1: 12, 147 | fast_skip_threshold_mode3: 8, 148 | fast_skip_threshold_mode7: 0, 149 | mode45_channel0: 0, 150 | refine_iterations_channel: 2, 151 | refine_iterations: [2, 2, 2, 2, 2, 2, 2, 0], 152 | } 153 | } 154 | 155 | /// Opaque slow settings. 
156 | pub const fn opaque_slow() -> Self { 157 | Self { 158 | channels: 3, 159 | mode_selection: [true as _, true as _, true as _, true as _], 160 | skip_mode2: false as _, 161 | fast_skip_threshold_mode1: 64, 162 | fast_skip_threshold_mode3: 64, 163 | fast_skip_threshold_mode7: 0, 164 | mode45_channel0: 0, 165 | refine_iterations_channel: 4, 166 | refine_iterations: [4, 4, 4, 4, 4, 4, 4, 0], 167 | } 168 | } 169 | 170 | /// Alpha ultra fast settings. 171 | pub const fn alpha_ultrafast() -> Self { 172 | Self { 173 | channels: 4, 174 | mode_selection: [false as _, false as _, true as _, true as _], 175 | skip_mode2: true as _, 176 | fast_skip_threshold_mode1: 0, 177 | fast_skip_threshold_mode3: 0, 178 | fast_skip_threshold_mode7: 4, 179 | mode45_channel0: 3, 180 | refine_iterations_channel: 1, 181 | refine_iterations: [2, 1, 2, 1, 1, 1, 2, 2], 182 | } 183 | } 184 | 185 | /// Alpha very fast settings. 186 | pub const fn alpha_very_fast() -> Self { 187 | Self { 188 | channels: 4, 189 | mode_selection: [false as _, true as _, true as _, true as _], 190 | skip_mode2: true as _, 191 | fast_skip_threshold_mode1: 0, 192 | fast_skip_threshold_mode3: 0, 193 | fast_skip_threshold_mode7: 4, 194 | mode45_channel0: 3, 195 | refine_iterations_channel: 2, 196 | refine_iterations: [2, 1, 2, 1, 2, 2, 2, 2], 197 | } 198 | } 199 | 200 | /// Alpha fast settings. 201 | pub const fn alpha_fast() -> Self { 202 | Self { 203 | channels: 4, 204 | mode_selection: [false as _, true as _, true as _, true as _], 205 | skip_mode2: true as _, 206 | fast_skip_threshold_mode1: 4, 207 | fast_skip_threshold_mode3: 4, 208 | fast_skip_threshold_mode7: 8, 209 | mode45_channel0: 3, 210 | refine_iterations_channel: 2, 211 | refine_iterations: [2, 1, 2, 1, 2, 2, 2, 2], 212 | } 213 | } 214 | 215 | /// Alpha basic settings. 
216 | pub const fn alpha_basic() -> Self { 217 | Self { 218 | channels: 4, 219 | mode_selection: [true as _, true as _, true as _, true as _], 220 | skip_mode2: true as _, 221 | fast_skip_threshold_mode1: 12, 222 | fast_skip_threshold_mode3: 8, 223 | fast_skip_threshold_mode7: 8, 224 | mode45_channel0: 0, 225 | refine_iterations_channel: 2, 226 | refine_iterations: [2, 2, 2, 2, 2, 2, 2, 2], 227 | } 228 | } 229 | 230 | /// Alpha slow settings. 231 | pub const fn alpha_slow() -> Self { 232 | Self { 233 | channels: 4, 234 | mode_selection: [true as _, true as _, true as _, true as _], 235 | skip_mode2: false as _, 236 | fast_skip_threshold_mode1: 64, 237 | fast_skip_threshold_mode3: 64, 238 | fast_skip_threshold_mode7: 64, 239 | mode45_channel0: 0, 240 | refine_iterations_channel: 4, 241 | refine_iterations: [4, 4, 4, 4, 4, 4, 4, 4], 242 | } 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /src/encode/bc1_to_5.rs: -------------------------------------------------------------------------------- 1 | pub(crate) struct BlockCompressorBC15 { 2 | block: [f32; 64], 3 | } 4 | 5 | impl Default for BlockCompressorBC15 { 6 | fn default() -> Self { 7 | Self { block: [0.0; 64] } 8 | } 9 | } 10 | 11 | impl BlockCompressorBC15 { 12 | pub(crate) fn load_block_interleaved_rgba( 13 | &mut self, 14 | rgba_data: &[u8], 15 | xx: usize, 16 | yy: usize, 17 | stride: usize, 18 | ) { 19 | for y in 0..4 { 20 | for x in 0..4 { 21 | let pixel_x = xx * 4 + x; 22 | let pixel_y = yy * 4 + y; 23 | 24 | let offset = pixel_y * stride + pixel_x * 4; 25 | 26 | let red = rgba_data[offset] as f32; 27 | let green = rgba_data[offset + 1] as f32; 28 | let blue = rgba_data[offset + 2] as f32; 29 | let alpha = rgba_data[offset + 3] as f32; 30 | 31 | self.block[y * 4 + x] = red; 32 | self.block[16 + y * 4 + x] = green; 33 | self.block[32 + y * 4 + x] = blue; 34 | self.block[48 + y * 4 + x] = alpha; 35 | } 36 | } 37 | } 38 | 39 | pub(crate) fn 
load_block_r_8bit( 40 | &mut self, 41 | rgba_data: &[u8], 42 | xx: usize, 43 | yy: usize, 44 | stride: usize, 45 | ) { 46 | for y in 0..4 { 47 | for x in 0..4 { 48 | let pixel_x = xx * 4 + x; 49 | let pixel_y = yy * 4 + y; 50 | 51 | let offset = pixel_y * stride + pixel_x * 4; 52 | let red = rgba_data[offset] as f32; 53 | 54 | self.block[48 + y * 4 + x] = red; 55 | } 56 | } 57 | } 58 | 59 | pub(crate) fn load_block_g_8bit( 60 | &mut self, 61 | rgba_data: &[u8], 62 | xx: usize, 63 | yy: usize, 64 | stride: usize, 65 | ) { 66 | for y in 0..4 { 67 | for x in 0..4 { 68 | let pixel_x = xx * 4 + x; 69 | let pixel_y = yy * 4 + y; 70 | 71 | let offset = pixel_y * stride + pixel_x * 4; 72 | let green = rgba_data[offset + 1] as f32; 73 | 74 | self.block[48 + y * 4 + x] = green; 75 | } 76 | } 77 | } 78 | 79 | pub(crate) fn load_block_alpha_4bit( 80 | &mut self, 81 | rgba_data: &[u8], 82 | xx: usize, 83 | yy: usize, 84 | stride: usize, 85 | ) -> [u32; 2] { 86 | let mut alpha_bits = [0; 2]; 87 | 88 | for y in 0..4 { 89 | for x in 0..4 { 90 | let pixel_x = xx * 4 + x; 91 | let pixel_y = yy * 4 + y; 92 | 93 | let offset = pixel_y * stride + pixel_x * 4; 94 | let alpha = rgba_data[offset + 3] as f32 / 255.0; 95 | 96 | // Convert alpha to 4 bits (0-15) 97 | let alpha4 = (alpha * 15.0) as u32; 98 | let bit_position = y * 16 + x * 4; 99 | 100 | if bit_position < 32 { 101 | alpha_bits[0] |= alpha4 << bit_position; 102 | } else { 103 | alpha_bits[1] |= alpha4 << (bit_position - 32); 104 | } 105 | } 106 | } 107 | 108 | alpha_bits 109 | } 110 | 111 | pub(crate) fn store_data( 112 | &self, 113 | blocks_buffer: &mut [u8], 114 | block_width: usize, 115 | xx: usize, 116 | yy: usize, 117 | data: &[u32], 118 | ) { 119 | let offset = (yy * block_width + xx) * (data.len() * 4); 120 | 121 | for (index, &value) in data.iter().enumerate() { 122 | let byte_offset = offset + index * 4; 123 | blocks_buffer[byte_offset] = value as u8; 124 | blocks_buffer[byte_offset + 1] = (value >> 8) as u8; 125 | 
blocks_buffer[byte_offset + 2] = (value >> 16) as u8; 126 | blocks_buffer[byte_offset + 3] = (value >> 24) as u8; 127 | } 128 | } 129 | 130 | fn compute_covar_dc(&self, covar: &mut [f32; 6], dc: &mut [f32; 3]) { 131 | for (p, value) in dc.iter_mut().enumerate() { 132 | let mut acc = 0.0; 133 | for k in 0..16 { 134 | acc += self.block[k + p * 16]; 135 | } 136 | *value = acc / 16.0; 137 | } 138 | 139 | let mut covar0 = 0.0; 140 | let mut covar1 = 0.0; 141 | let mut covar2 = 0.0; 142 | let mut covar3 = 0.0; 143 | let mut covar4 = 0.0; 144 | let mut covar5 = 0.0; 145 | 146 | for k in 0..16 { 147 | let rgb0 = self.block[k] - dc[0]; 148 | let rgb1 = self.block[k + 16] - dc[1]; 149 | let rgb2 = self.block[k + 32] - dc[2]; 150 | 151 | covar0 += rgb0 * rgb0; 152 | covar1 += rgb0 * rgb1; 153 | covar2 += rgb0 * rgb2; 154 | covar3 += rgb1 * rgb1; 155 | covar4 += rgb1 * rgb2; 156 | covar5 += rgb2 * rgb2; 157 | } 158 | 159 | covar[0] = covar0; 160 | covar[1] = covar1; 161 | covar[2] = covar2; 162 | covar[3] = covar3; 163 | covar[4] = covar4; 164 | covar[5] = covar5; 165 | } 166 | 167 | fn ssymv(result: &mut [f32; 3], covar: &[f32; 6], a_vector: &[f32; 3]) { 168 | result[0] = covar[0] * a_vector[0] + covar[1] * a_vector[1] + covar[2] * a_vector[2]; 169 | result[1] = covar[1] * a_vector[0] + covar[3] * a_vector[1] + covar[4] * a_vector[2]; 170 | result[2] = covar[2] * a_vector[0] + covar[4] * a_vector[1] + covar[5] * a_vector[2]; 171 | } 172 | 173 | fn compute_axis3(axis: &mut [f32; 3], covar: &[f32; 6], power_iterations: i32) { 174 | let mut a_vector = [1.0; 3]; 175 | 176 | for i in 0..power_iterations { 177 | Self::ssymv(axis, covar, &a_vector); 178 | 179 | a_vector.copy_from_slice(&axis[..]); 180 | 181 | if i % 2 == 1 { 182 | let mut norm_sq = 0.0; 183 | for value in axis.iter() { 184 | norm_sq += value * value; 185 | } 186 | 187 | let rnorm = 1.0 / norm_sq.sqrt(); 188 | 189 | for value in a_vector.iter_mut() { 190 | *value *= rnorm; 191 | } 192 | } 193 | } 194 | 195 | 
axis.copy_from_slice(&a_vector); 196 | } 197 | 198 | fn pick_endpoints(&self, c0: &mut [f32; 3], c1: &mut [f32; 3], axis: &[f32; 3], dc: &[f32; 3]) { 199 | let mut min_dot: f32 = 256.0 * 256.0; 200 | let mut max_dot: f32 = 0.0; 201 | 202 | for y in 0..4 { 203 | for x in 0..4 { 204 | let mut dot = 0.0; 205 | for p in 0..3 { 206 | dot += (self.block[p * 16 + y * 4 + x] - dc[p]) * axis[p]; 207 | } 208 | 209 | min_dot = f32::min(min_dot, dot); 210 | max_dot = f32::max(max_dot, dot); 211 | } 212 | } 213 | 214 | if max_dot - min_dot < 1.0 { 215 | min_dot -= 0.5; 216 | max_dot += 0.5; 217 | } 218 | 219 | let mut norm_sq = 0.0; 220 | for value in axis.iter() { 221 | norm_sq += *value * *value; 222 | } 223 | 224 | let rnorm_sq = norm_sq.recip(); 225 | for p in 0..3 { 226 | c0[p] = f32::clamp(dc[p] + min_dot * rnorm_sq * axis[p], 0.0, 255.0); 227 | c1[p] = f32::clamp(dc[p] + max_dot * rnorm_sq * axis[p], 0.0, 255.0); 228 | } 229 | } 230 | 231 | fn dec_rgb565(c: &mut [f32; 3], p: i32) { 232 | let b5 = p & 31; 233 | let g6 = (p >> 5) & 63; 234 | let r5 = (p >> 11) & 31; 235 | 236 | c[0] = ((r5 << 3) + (r5 >> 2)) as f32; 237 | c[1] = ((g6 << 2) + (g6 >> 4)) as f32; 238 | c[2] = ((b5 << 3) + (b5 >> 2)) as f32; 239 | } 240 | 241 | fn enc_rgb565(c: &[f32; 3]) -> i32 { 242 | let r = c[0] as i32; 243 | let g = c[1] as i32; 244 | let b = c[2] as i32; 245 | 246 | let r5 = (r * 31 + 128 + ((r * 31) >> 8)) >> 8; 247 | let g6 = (g * 63 + 128 + ((g * 63) >> 8)) >> 8; 248 | let b5 = (b * 31 + 128 + ((b * 31) >> 8)) >> 8; 249 | 250 | (r5 << 11) + (g6 << 5) + b5 251 | } 252 | 253 | fn fast_quant(&self, p0: i32, p1: i32) -> u32 { 254 | let mut c0 = [0.0; 3]; 255 | let mut c1 = [0.0; 3]; 256 | Self::dec_rgb565(&mut c0, p0); 257 | Self::dec_rgb565(&mut c1, p1); 258 | 259 | let mut dir = [0.0; 3]; 260 | for p in 0..3 { 261 | dir[p] = c1[p] - c0[p]; 262 | } 263 | 264 | let mut sq_norm = 0.0; 265 | for value in dir.iter() { 266 | sq_norm += value.powi(2); 267 | } 268 | 269 | let rsq_norm = 
sq_norm.recip(); 270 | 271 | for value in dir.iter_mut() { 272 | *value *= rsq_norm * 3.0; 273 | } 274 | 275 | let mut bias = 0.5; 276 | for p in 0..3 { 277 | bias -= c0[p] * dir[p]; 278 | } 279 | 280 | let mut bits = 0; 281 | let mut scaler = 1; 282 | for k in 0..16 { 283 | let mut dot = 0.0; 284 | for (p, value) in dir.iter().enumerate() { 285 | dot += self.block[k + p * 16] * value; 286 | } 287 | 288 | let q = i32::clamp((dot + bias) as i32, 0, 3); 289 | bits += q as u32 * scaler; 290 | scaler = scaler.wrapping_mul(4); 291 | } 292 | 293 | bits 294 | } 295 | 296 | fn bc1_refine(&self, pe: &mut [i32; 2], bits: u32, dc: &[f32; 3]) { 297 | let mut c0 = [0.0; 3]; 298 | let mut c1 = [0.0; 3]; 299 | 300 | if (bits ^ (bits.wrapping_mul(4))) < 4 { 301 | c0.copy_from_slice(&dc[..]); 302 | c1.copy_from_slice(&dc[..]); 303 | } else { 304 | let mut atb1 = [0.0; 3]; 305 | let mut sum_q = 0.0; 306 | let mut sum_qq = 0.0; 307 | let mut shifted_bits = bits; 308 | 309 | for k in 0..16 { 310 | let q = (shifted_bits & 3) as f32; 311 | shifted_bits >>= 2; 312 | 313 | let x = 3.0 - q; 314 | 315 | sum_q += q; 316 | sum_qq += q * q; 317 | 318 | for (p, value) in atb1.iter_mut().enumerate() { 319 | *value += x * self.block[k + p * 16]; 320 | } 321 | } 322 | 323 | let mut sum = [0.0; 3]; 324 | let mut atb2 = [0.0; 3]; 325 | 326 | for p in 0..3 { 327 | sum[p] = dc[p] * 16.0; 328 | atb2[p] = 3.0 * sum[p] - atb1[p]; 329 | } 330 | 331 | let cxx = 16.0 * 9.0 - 2.0 * 3.0 * sum_q + sum_qq; 332 | let cyy = sum_qq; 333 | let cxy = 3.0 * sum_q - sum_qq; 334 | let scale = 3.0 * (cxx * cyy - cxy * cxy).recip(); 335 | 336 | for p in 0..3 { 337 | c0[p] = (atb1[p] * cyy - atb2[p] * cxy) * scale; 338 | c1[p] = (atb2[p] * cxx - atb1[p] * cxy) * scale; 339 | 340 | c0[p] = f32::clamp(c0[p], 0.0, 255.0); 341 | c1[p] = f32::clamp(c1[p], 0.0, 255.0); 342 | } 343 | } 344 | 345 | pe[0] = Self::enc_rgb565(&c0); 346 | pe[1] = Self::enc_rgb565(&c1); 347 | } 348 | 349 | fn fix_qbits(qbits: u32) -> u32 { 350 | const 
MASK_01B: u32 = 0x55555555; 351 | const MASK_10B: u32 = 0xAAAAAAAA; 352 | 353 | let qbits0 = qbits & MASK_01B; 354 | let qbits1 = qbits & MASK_10B; 355 | 356 | (qbits1 >> 1) + (qbits1 ^ (qbits0 << 1)) 357 | } 358 | 359 | pub(crate) fn compress_block_bc1_core(&self) -> [u32; 2] { 360 | let power_iterations = 4; 361 | let refine_iterations = 1; 362 | 363 | let mut covar = [0.0; 6]; 364 | let mut dc = [0.0; 3]; 365 | self.compute_covar_dc(&mut covar, &mut dc); 366 | 367 | const EPS: f32 = f32::EPSILON; 368 | covar[0] += EPS; 369 | covar[3] += EPS; 370 | covar[5] += EPS; 371 | 372 | let mut axis = [0.0; 3]; 373 | Self::compute_axis3(&mut axis, &covar, power_iterations); 374 | 375 | let mut c0 = [0.0; 3]; 376 | let mut c1 = [0.0; 3]; 377 | self.pick_endpoints(&mut c0, &mut c1, &axis, &dc); 378 | 379 | let mut p = [0; 2]; 380 | p[0] = Self::enc_rgb565(&c0); 381 | p[1] = Self::enc_rgb565(&c1); 382 | if p[0] < p[1] { 383 | p.swap(0, 1); 384 | } 385 | 386 | let mut data = [0; 2]; 387 | data[0] = ((p[1] as u32) << 16) | p[0] as u32; 388 | data[1] = self.fast_quant(p[0], p[1]); 389 | 390 | for _ in 0..refine_iterations { 391 | self.bc1_refine(&mut p, data[1], &dc); 392 | if p[0] < p[1] { 393 | p.swap(0, 1); 394 | } 395 | data[0] = ((p[1] as u32) << 16) | p[0] as u32; 396 | data[1] = self.fast_quant(p[0], p[1]); 397 | } 398 | 399 | data[1] = Self::fix_qbits(data[1]); 400 | 401 | data 402 | } 403 | 404 | pub(crate) fn compress_block_bc3_alpha(&self) -> [u32; 2] { 405 | let mut ep = [255.0, 0.0]; 406 | 407 | // Find min/max endpoints using block[48] to block[63] for alpha 408 | for k in 0..16 { 409 | ep[0] = f32::min(ep[0], self.block[48 + k]); 410 | ep[1] = f32::max(ep[1], self.block[48 + k]); 411 | } 412 | 413 | // Prevent division by zero 414 | if ep[0] == ep[1] { 415 | ep[1] = ep[0] + 0.1; 416 | } 417 | 418 | let mut qblock = [0; 2]; 419 | let scale = 7.0 / (ep[1] - ep[0]); 420 | 421 | for k in 0..16 { 422 | let v = self.block[48 + k]; 423 | let proj = (v - ep[0]) * scale + 
0.5; 424 | 425 | let mut q = i32::clamp(proj as i32, 0, 7); 426 | q = 7 - q; 427 | 428 | if q > 0 { 429 | q += 1; 430 | } 431 | if q == 8 { 432 | q = 1; 433 | } 434 | 435 | qblock[k / 8] |= (q as u32) << ((k % 8) * 3); 436 | } 437 | 438 | let mut data = [0; 2]; 439 | data[0] = (u32::clamp(ep[0] as u32, 0, 255) << 8) | u32::clamp(ep[1] as u32, 0, 255); 440 | data[0] |= qblock[0] << 16; 441 | data[1] = qblock[0] >> 16; 442 | data[1] |= qblock[1] << 8; 443 | 444 | data 445 | } 446 | } 447 | -------------------------------------------------------------------------------- /compressor/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | path::PathBuf, 4 | sync::Arc, 5 | time::{Duration, Instant}, 6 | }; 7 | 8 | use block_compression::{ 9 | half::f16, BC6HSettings, BC7Settings, CompressionVariant, GpuBlockCompressor, 10 | }; 11 | use bytemuck::cast_slice; 12 | use ddsfile::{AlphaMode, D3D10ResourceDimension, Dds, DxgiFormat, NewDxgiParams}; 13 | use image::ImageReader; 14 | use pollster::block_on; 15 | use wgpu::{ 16 | util::{DeviceExt, TextureDataOrder}, 17 | wgt::{Dx12SwapchainKind, Dx12UseFrameLatencyWaitableObject}, 18 | BackendOptions, Backends, Buffer, BufferDescriptor, BufferUsages, CommandEncoderDescriptor, 19 | ComputePassDescriptor, ComputePassTimestampWrites, Device, DeviceDescriptor, 20 | Dx12BackendOptions, Dx12Compiler, Error, ExperimentalFeatures, Extent3d, Features, 21 | GlBackendOptions, Instance, InstanceDescriptor, InstanceFlags, MapMode, MemoryHints, 22 | NoopBackendOptions, PollType, PowerPreference, QueryType, Queue, Texture, TextureDescriptor, 23 | TextureDimension, TextureFormat, TextureUsages, TextureViewDescriptor, Trace, 24 | }; 25 | 26 | fn main() { 27 | let (variant, file_name) = match parse_args() { 28 | Some(args) => args, 29 | None => return, 30 | }; 31 | 32 | let (device, queue) = create_resources(); 33 | let mut compressor: GpuBlockCompressor = 
GpuBlockCompressor::new(device.clone(), queue.clone()); 34 | 35 | let start = Instant::now(); 36 | 37 | let texture = read_image_and_create_texture(&device, &queue, &file_name, variant); 38 | let texture_view = texture.create_view(&TextureViewDescriptor::default()); 39 | let width = texture.width(); 40 | let height = texture.height(); 41 | 42 | let duration = start.elapsed(); 43 | println!( 44 | "Image read and upload took: {:.3} ms", 45 | duration.as_secs_f64() * 1000.0 46 | ); 47 | 48 | let blocks_buffer = device.create_buffer(&BufferDescriptor { 49 | label: Some("blocks buffer"), 50 | size: variant.blocks_byte_size(width, height) as _, 51 | usage: BufferUsages::COPY_SRC | BufferUsages::STORAGE, 52 | mapped_at_creation: false, 53 | }); 54 | 55 | compressor.add_compression_task( 56 | variant, 57 | &texture_view, 58 | width, 59 | height, 60 | &blocks_buffer, 61 | None, 62 | None, 63 | ); 64 | 65 | compress(&mut compressor, &device, &queue); 66 | 67 | let start = Instant::now(); 68 | 69 | let block_data = download_blocks_data(&device, &queue, blocks_buffer); 70 | 71 | let duration = start.elapsed(); 72 | println!( 73 | "Block data download took: {:.3} ms", 74 | duration.as_secs_f64() * 1000.0 75 | ); 76 | 77 | let start = Instant::now(); 78 | 79 | write_dds_file(&file_name, variant, width, height, block_data); 80 | 81 | let duration = start.elapsed(); 82 | println!( 83 | "DDS output to disk took: {:.3} ms", 84 | duration.as_secs_f64() * 1000.0 85 | ); 86 | } 87 | 88 | fn create_resources() -> (Device, Queue) { 89 | let instance = Instance::new(&InstanceDescriptor { 90 | backends: Backends::from_env().unwrap_or_default(), 91 | flags: InstanceFlags::from_build_config().with_env(), 92 | memory_budget_thresholds: Default::default(), 93 | backend_options: BackendOptions { 94 | gl: GlBackendOptions::default(), 95 | dx12: Dx12BackendOptions { 96 | shader_compiler: Dx12Compiler::StaticDxc, 97 | presentation_system: Dx12SwapchainKind::DxgiFromHwnd, 98 | 
latency_waitable_object: Dx12UseFrameLatencyWaitableObject::Wait, 99 | } 100 | .with_env(), 101 | noop: NoopBackendOptions::default(), 102 | }, 103 | }); 104 | 105 | let adapter = block_on(instance.request_adapter(&wgpu::RequestAdapterOptions { 106 | power_preference: PowerPreference::HighPerformance, 107 | compatible_surface: None, 108 | force_fallback_adapter: false, 109 | })) 110 | .expect("Failed to find an appropriate adapter"); 111 | 112 | let (device, queue) = block_on(adapter.request_device(&DeviceDescriptor { 113 | label: Some("main device"), 114 | required_features: Features::TIMESTAMP_QUERY, 115 | required_limits: adapter.limits(), 116 | experimental_features: ExperimentalFeatures::disabled(), 117 | memory_hints: MemoryHints::MemoryUsage, 118 | trace: Trace::Off, 119 | })) 120 | .expect("Failed to create device"); 121 | device.on_uncaptured_error(Arc::new(error_handler)); 122 | 123 | let info = adapter.get_info(); 124 | println!("Using backend: {:?}", info.backend); 125 | 126 | (device, queue) 127 | } 128 | 129 | fn read_image_and_create_texture( 130 | device: &Device, 131 | queue: &Queue, 132 | file_name: &str, 133 | variant: CompressionVariant, 134 | ) -> Texture { 135 | let image = ImageReader::open(file_name) 136 | .expect("can't open input image") 137 | .decode() 138 | .expect("can't decode image"); 139 | 140 | let rgba_image = image.to_rgba8(); 141 | let width = rgba_image.width(); 142 | let height = rgba_image.height(); 143 | 144 | if matches!(variant, CompressionVariant::BC6H(..)) { 145 | let rgba_f16_data: Vec = rgba_image 146 | .iter() 147 | .flat_map(|color| f16::from_f64(srgb_to_linear(*color)).to_le_bytes()) 148 | .collect(); 149 | 150 | device.create_texture_with_data( 151 | queue, 152 | &TextureDescriptor { 153 | label: Some(file_name), 154 | size: Extent3d { 155 | width, 156 | height, 157 | depth_or_array_layers: 1, 158 | }, 159 | mip_level_count: 1, 160 | sample_count: 1, 161 | dimension: TextureDimension::D2, 162 | format: 
TextureFormat::Rgba16Float, 163 | usage: TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING, 164 | view_formats: &[], 165 | }, 166 | TextureDataOrder::LayerMajor, 167 | rgba_f16_data.as_slice(), 168 | ) 169 | } else { 170 | device.create_texture_with_data( 171 | queue, 172 | &TextureDescriptor { 173 | label: Some(file_name), 174 | size: Extent3d { 175 | width, 176 | height, 177 | depth_or_array_layers: 1, 178 | }, 179 | mip_level_count: 1, 180 | sample_count: 1, 181 | dimension: TextureDimension::D2, 182 | format: TextureFormat::Rgba8Unorm, 183 | usage: TextureUsages::COPY_DST | TextureUsages::TEXTURE_BINDING, 184 | view_formats: &[], 185 | }, 186 | TextureDataOrder::LayerMajor, 187 | &rgba_image, 188 | ) 189 | } 190 | } 191 | 192 | #[inline] 193 | pub fn srgb_to_linear(srgb: u8) -> f64 { 194 | let v = (srgb as f64) / 255.0; 195 | if v <= 0.04045 { 196 | v / 12.92 197 | } else { 198 | ((v + 0.055) / 1.055).powf(2.4) 199 | } 200 | } 201 | 202 | fn compress(compressor: &mut GpuBlockCompressor, device: &Device, queue: &Queue) { 203 | let timestamp_query_set = device.create_query_set(&wgpu::QuerySetDescriptor { 204 | label: Some("timestamp query set"), 205 | count: 2, 206 | ty: QueryType::Timestamp, 207 | }); 208 | 209 | let timestamp_resolve_buffer = device.create_buffer(&BufferDescriptor { 210 | label: Some("timestamp resolve buffer"), 211 | size: 16, 212 | usage: BufferUsages::COPY_DST | BufferUsages::COPY_SRC | BufferUsages::QUERY_RESOLVE, 213 | mapped_at_creation: false, 214 | }); 215 | 216 | let timestamp_readback_buffer = device.create_buffer(&BufferDescriptor { 217 | label: Some("timestamp read-back buffer"), 218 | size: 16, 219 | usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, 220 | mapped_at_creation: false, 221 | }); 222 | 223 | let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor { 224 | label: Some("command encoder"), 225 | }); 226 | 227 | { 228 | let mut pass = encoder.begin_compute_pass(&ComputePassDescriptor { 229 | 
label: Some("compute pass"), 230 | timestamp_writes: Some(ComputePassTimestampWrites { 231 | query_set: ×tamp_query_set, 232 | beginning_of_pass_write_index: Some(0), 233 | end_of_pass_write_index: Some(1), 234 | }), 235 | }); 236 | 237 | compressor.compress(&mut pass); 238 | } 239 | 240 | encoder.resolve_query_set(×tamp_query_set, 0..2, ×tamp_resolve_buffer, 0); 241 | 242 | encoder.copy_buffer_to_buffer( 243 | ×tamp_resolve_buffer, 244 | 0, 245 | ×tamp_readback_buffer, 246 | 0, 247 | 16, 248 | ); 249 | 250 | queue.submit([encoder.finish()]); 251 | 252 | { 253 | let buffer_slice = timestamp_readback_buffer.slice(..); 254 | 255 | let (tx, rx) = std::sync::mpsc::channel(); 256 | buffer_slice.map_async(MapMode::Read, move |v| tx.send(v).unwrap()); 257 | 258 | let _ = device.poll(PollType::Wait { 259 | submission_index: None, 260 | timeout: Some(Duration::from_secs(60)), 261 | }); 262 | 263 | match rx.recv() { 264 | Ok(Ok(())) => { 265 | let data = buffer_slice.get_mapped_range(); 266 | let timestamps: &[u64] = cast_slice(&data); 267 | 268 | let period = queue.get_timestamp_period() as f64; 269 | let start_ns = timestamps[0] as f64 * period; 270 | let end_ns = timestamps[1] as f64 * period; 271 | let duration_ms = (end_ns - start_ns) / 1_000_000.0; 272 | 273 | println!("Compression took: {duration_ms:.3} ms"); 274 | } 275 | _ => panic!("couldn't read from buffer"), 276 | } 277 | 278 | timestamp_readback_buffer.unmap(); 279 | } 280 | } 281 | 282 | fn download_blocks_data(device: &Device, queue: &Queue, block_buffer: Buffer) -> Vec { 283 | let size = block_buffer.size(); 284 | 285 | let staging_buffer = device.create_buffer(&BufferDescriptor { 286 | label: Some("staging buffer"), 287 | size, 288 | usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, 289 | mapped_at_creation: false, 290 | }); 291 | 292 | let mut copy_encoder = device.create_command_encoder(&CommandEncoderDescriptor { 293 | label: Some("copy encoder"), 294 | }); 295 | 296 | 
copy_encoder.copy_buffer_to_buffer(&block_buffer, 0, &staging_buffer, 0, size); 297 | 298 | queue.submit([copy_encoder.finish()]); 299 | 300 | let result; 301 | 302 | { 303 | let buffer_slice = staging_buffer.slice(..); 304 | 305 | let (tx, rx) = std::sync::mpsc::channel(); 306 | buffer_slice.map_async(MapMode::Read, move |v| tx.send(v).unwrap()); 307 | 308 | let _ = device.poll(PollType::Wait { 309 | submission_index: None, 310 | timeout: Some(Duration::from_secs(60)), 311 | }); 312 | 313 | match rx.recv() { 314 | Ok(Ok(())) => { 315 | result = buffer_slice.get_mapped_range().to_vec(); 316 | } 317 | _ => panic!("couldn't read from buffer"), 318 | } 319 | } 320 | 321 | staging_buffer.unmap(); 322 | 323 | result 324 | } 325 | 326 | fn write_dds_file( 327 | file_name: &str, 328 | variant: CompressionVariant, 329 | width: u32, 330 | height: u32, 331 | block_data: Vec, 332 | ) { 333 | let mut dds = Dds::new_dxgi(NewDxgiParams { 334 | height, 335 | width, 336 | depth: None, 337 | format: dxgi_format(variant), 338 | mipmap_levels: Some(1), 339 | array_layers: None, 340 | caps2: None, 341 | is_cubemap: false, 342 | resource_dimension: D3D10ResourceDimension::Texture2D, 343 | alpha_mode: AlphaMode::Straight, 344 | }) 345 | .expect("failed to create DDS header"); 346 | 347 | dds.data = block_data; 348 | 349 | let mut dds_name = PathBuf::from(file_name); 350 | dds_name.set_extension("dds"); 351 | 352 | let mut file = File::create(dds_name).expect("failed to create output file"); 353 | dds.write(&mut file).expect("failed to write DDS file"); 354 | } 355 | 356 | fn dxgi_format(variant: CompressionVariant) -> DxgiFormat { 357 | match variant { 358 | CompressionVariant::BC1 => DxgiFormat::BC1_UNorm_sRGB, 359 | CompressionVariant::BC2 => DxgiFormat::BC2_UNorm_sRGB, 360 | CompressionVariant::BC3 => DxgiFormat::BC3_UNorm_sRGB, 361 | CompressionVariant::BC4 => DxgiFormat::BC4_UNorm, 362 | CompressionVariant::BC5 => DxgiFormat::BC5_UNorm, 363 | CompressionVariant::BC6H(..) 
=> DxgiFormat::BC6H_UF16, 364 | CompressionVariant::BC7(..) => DxgiFormat::BC7_UNorm_sRGB, 365 | } 366 | } 367 | 368 | fn print_help() { 369 | println!("Usage: compressor "); 370 | println!("\nCompression variants:"); 371 | println!(" bc1 - BC1 compression (RGB)"); 372 | println!(" bc2 - BC2 compression with sharp alpha (RGBA)"); 373 | println!(" bc3 - BC3 compression with smooth alpha (RGBA)"); 374 | println!(" bc4 - BC4 compression (R)"); 375 | println!(" bc5 - BC5 compression (RG)"); 376 | println!(" bc6h - BC6H compression (RGB HDR)"); 377 | println!(" bc7 - BC7 compression with smooth alpha (RGBA)"); 378 | } 379 | 380 | fn parse_args() -> Option<(CompressionVariant, String)> { 381 | let args: Vec = std::env::args().collect(); 382 | 383 | if args.len() != 3 || args.contains(&"--help".to_string()) { 384 | print_help(); 385 | return None; 386 | } 387 | 388 | let variant = match args[1].to_lowercase().as_str() { 389 | "bc1" => CompressionVariant::BC1, 390 | "bc2" => CompressionVariant::BC2, 391 | "bc3" => CompressionVariant::BC3, 392 | "bc4" => CompressionVariant::BC4, 393 | "bc5" => CompressionVariant::BC5, 394 | "bc6h" => CompressionVariant::BC6H(BC6HSettings::very_slow()), 395 | "bc7" => CompressionVariant::BC7(BC7Settings::alpha_slow()), 396 | _ => { 397 | println!("Error: Invalid compression variant"); 398 | print_help(); 399 | return None; 400 | } 401 | }; 402 | 403 | let file_name = args[2].clone(); 404 | 405 | Some((variant, file_name)) 406 | } 407 | 408 | pub fn error_handler(error: Error) { 409 | let (message_type, message) = match error { 410 | Error::OutOfMemory { source } => ("OutOfMemory", source.to_string()), 411 | Error::Validation { 412 | source, 413 | description, 414 | } => ("Validation", format!("{source}: {description}")), 415 | Error::Internal { 416 | source, 417 | description, 418 | } => ("Internal", format!("{source}: {description}")), 419 | }; 420 | 421 | panic!("wgpu [{message_type}] [error]: {message}"); 422 | } 423 | 
--------------------------------------------------------------------------------
/tests/metrics.rs:
--------------------------------------------------------------------------------
use block_compression::{
    decode::decompress_blocks_as_rgba8, encode::compress_rgba8, BC6HSettings, BC7Settings,
    CompressionVariant, GpuBlockCompressor,
};
use wgpu::{CommandEncoderDescriptor, ComputePassDescriptor, TextureViewDescriptor};

use self::common::{
    create_blocks_buffer, create_wgpu_resources, download_blocks_data,
    read_image_and_create_texture, srgb_to_linear, BRICK_FILE_PATH, MARBLE_FILE_PATH,
};

mod common;

pub const BRICK_ALPHA_FILE_PATH: &str = "tests/images/brick-alpha.png";
pub const MARBLE_ALPHA_FILE_PATH: &str = "tests/images/marble-alpha.png";
pub const BLENDER_FILE_PATH: &str = "tests/images/blender.png";

/// Overall PSNR/MSE of an image comparison plus a per-channel breakdown.
#[derive(Debug, Clone)]
pub struct PsnrResult {
    pub overall_psnr: f64,
    pub overall_mse: f64,
    pub channel_results: ChannelResults,
}

/// Per-channel quality metrics, one entry for each RGBA channel.
#[derive(Debug, Clone)]
pub struct ChannelResults {
    pub red: ChannelMetrics,
    pub green: ChannelMetrics,
    pub blue: ChannelMetrics,
    pub alpha: ChannelMetrics,
}

/// PSNR and MSE of a single channel.
#[derive(Debug, Clone)]
pub struct ChannelMetrics {
    pub psnr: f64,
    pub mse: f64,
}

/// Calculates quality metrics for a given image. The input data and output data must be RGBA data.
40 | pub fn calculate_image_metrics_rgba8( 41 | original: &[u8], 42 | compressed: &[u8], 43 | width: u32, 44 | height: u32, 45 | channels: u32, 46 | ) -> PsnrResult { 47 | if original.len() != compressed.len() { 48 | panic!("Image buffers must have same length"); 49 | } 50 | if original.len() != (width * height * 4) as usize { 51 | panic!("Buffer size doesn't match dimensions"); 52 | } 53 | 54 | let mut channel_mse = [0.0; 4]; 55 | let pixel_count = (width * height) as f64; 56 | 57 | for index in (0..original.len()).step_by(4) { 58 | for channel in 0..channels as usize { 59 | let orig = if channel < 3 { 60 | srgb_to_linear(original[index + channel]) 61 | } else { 62 | (original[index + channel] as f64) / 255.0 63 | }; 64 | 65 | let comp = if channel < 3 { 66 | srgb_to_linear(compressed[index + channel]) 67 | } else { 68 | (compressed[index + channel] as f64) / 255.0 69 | }; 70 | 71 | let diff = orig - comp; 72 | channel_mse[channel] += diff * diff; 73 | } 74 | } 75 | 76 | // Normalize MSE values 77 | channel_mse.iter_mut().for_each(|mse| *mse /= pixel_count); 78 | 79 | let calculate_psnr = |mse: f64| -> f64 { 80 | if mse == 0.0 { 81 | 0.0 82 | } else { 83 | 20.0 * (1.0 / mse.sqrt()).log10() 84 | } 85 | }; 86 | 87 | let overall_mse = channel_mse.iter().sum::() / channels as f64; 88 | let overall_psnr = calculate_psnr(overall_mse); 89 | 90 | let channel_results = ChannelResults { 91 | red: ChannelMetrics { 92 | mse: channel_mse[0], 93 | psnr: calculate_psnr(channel_mse[0]), 94 | }, 95 | green: ChannelMetrics { 96 | mse: channel_mse[1], 97 | psnr: calculate_psnr(channel_mse[1]), 98 | }, 99 | blue: ChannelMetrics { 100 | mse: channel_mse[2], 101 | psnr: calculate_psnr(channel_mse[2]), 102 | }, 103 | alpha: ChannelMetrics { 104 | mse: channel_mse[3], 105 | psnr: calculate_psnr(channel_mse[3]), 106 | }, 107 | }; 108 | 109 | PsnrResult { 110 | overall_psnr, 111 | overall_mse, 112 | channel_results, 113 | } 114 | } 115 | 116 | fn print_metrics(name: &str, metrics: 
&PsnrResult) { 117 | println!("-----------------------"); 118 | println!("Image name: {name}"); 119 | println!("Overall PSNR: {:.4} dB", metrics.overall_psnr); 120 | println!("Overall MSE: {:.9}", metrics.overall_mse); 121 | println!( 122 | "Red channel PSNR: {:.4} dB", 123 | metrics.channel_results.red.psnr 124 | ); 125 | println!( 126 | "Green channel PSNR: {:.4} dB", 127 | metrics.channel_results.green.psnr 128 | ); 129 | println!( 130 | "Blue channel PSNR: {:.4} dB", 131 | metrics.channel_results.blue.psnr 132 | ); 133 | println!( 134 | "Alpha channel PSNR: {:.4} dB", 135 | metrics.channel_results.alpha.psnr 136 | ); 137 | println!("-----------------------"); 138 | } 139 | 140 | fn compress_image_reference( 141 | variant: CompressionVariant, 142 | width: u32, 143 | height: u32, 144 | data: &[u8], 145 | ) -> Vec { 146 | let output_size = variant.blocks_byte_size(width, height); 147 | let mut blocks = vec![0; output_size]; 148 | compress_rgba8(variant, data, &mut blocks, width, height, width * 4); 149 | blocks 150 | } 151 | 152 | fn compress_image(image_path: &str, variant: CompressionVariant) -> (u32, u32, Vec, Vec) { 153 | let (device, queue) = create_wgpu_resources(); 154 | let mut block_compressor = GpuBlockCompressor::new(device.clone(), queue.clone()); 155 | 156 | let (texture, original_data) = 157 | read_image_and_create_texture(&device, &queue, image_path, variant); 158 | let blocks_size = variant.blocks_byte_size(texture.width(), texture.height()); 159 | 160 | let blocks = create_blocks_buffer(&device, blocks_size as u64); 161 | 162 | block_compressor.add_compression_task( 163 | variant, 164 | &texture.create_view(&TextureViewDescriptor::default()), 165 | texture.width(), 166 | texture.height(), 167 | &blocks, 168 | None, 169 | None, 170 | ); 171 | 172 | let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor { 173 | label: Some("command encoder"), 174 | }); 175 | 176 | { 177 | let mut pass = 
encoder.begin_compute_pass(&ComputePassDescriptor { 178 | label: Some("compute pass"), 179 | timestamp_writes: None, 180 | }); 181 | 182 | block_compressor.compress(&mut pass); 183 | } 184 | 185 | queue.submit([encoder.finish()]); 186 | 187 | let blocks_data = download_blocks_data(&device, &queue, blocks); 188 | 189 | ( 190 | texture.width(), 191 | texture.height(), 192 | original_data, 193 | blocks_data, 194 | ) 195 | } 196 | 197 | fn calculate_psnr( 198 | variant: CompressionVariant, 199 | channels: u32, 200 | width: u32, 201 | height: u32, 202 | original_data: &[u8], 203 | blocks_data: &[u8], 204 | ) -> PsnrResult { 205 | let size = width * height * 4; 206 | 207 | let mut decompressed_data = vec![0; size as usize]; 208 | decompress_blocks_as_rgba8(variant, width, height, blocks_data, &mut decompressed_data); 209 | 210 | calculate_image_metrics_rgba8(original_data, &decompressed_data, width, height, channels) 211 | } 212 | 213 | fn compare_psnr(image_path: &str, variant: CompressionVariant, channels: u32) { 214 | let image_name = std::path::Path::new(image_path) 215 | .file_name() 216 | .unwrap() 217 | .to_str() 218 | .unwrap(); 219 | 220 | let (width, height, original_data, blocks_data) = compress_image(image_path, variant); 221 | 222 | let psnr = calculate_psnr( 223 | variant, 224 | channels, 225 | width, 226 | height, 227 | &original_data, 228 | &blocks_data, 229 | ); 230 | 231 | let reference_block_data = compress_image_reference(variant, width, height, &original_data); 232 | 233 | let reference_psnr = calculate_psnr( 234 | variant, 235 | channels, 236 | width, 237 | height, 238 | &original_data, 239 | &reference_block_data, 240 | ); 241 | 242 | print_metrics(image_name, &psnr); 243 | print_metrics(image_name, &reference_psnr); 244 | 245 | const DIFFERENCE: f64 = 0.0025; 246 | 247 | if f64::abs(reference_psnr.overall_psnr - psnr.overall_psnr) > DIFFERENCE { 248 | panic!( 249 | "Significant overall PSNR difference for image `{image_name}`: {:.3} != {:.3}", 250 
reference_psnr.overall_psnr, psnr.overall_psnr
        );
    }
}

#[test]
fn psnr_bc1() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC1, 3);
    }
}

#[test]
fn psnr_bc3() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC3, 4);
    }
}

#[test]
fn psnr_bc6h_very_fast() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC6H(BC6HSettings::very_fast()), 3);
    }
}

#[test]
fn psnr_bc6h_fast() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC6H(BC6HSettings::fast()), 3);
    }
}

#[test]
fn psnr_bc6h_basic() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC6H(BC6HSettings::basic()), 3);
    }
}

#[test]
fn psnr_bc6h_slow() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC6H(BC6HSettings::slow()), 3);
    }
}

#[test]
fn psnr_bc6h_very_slow() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC6H(BC6HSettings::very_slow()), 3);
    }
}

#[test]
fn psnr_bc7_alpha_ultra_fast() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::alpha_ultrafast()), 4);
    }
}

#[test]
fn psnr_bc7_alpha_very_fast() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::alpha_very_fast()), 4);
    }
}

#[test]
fn psnr_bc7_alpha_fast() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::alpha_fast()), 4);
    }
}

#[test]
fn psnr_bc7_alpha_basic() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::alpha_basic()), 4);
    }
}

#[test]
fn psnr_bc7_alpha_slow() {
    for path in [BRICK_ALPHA_FILE_PATH, MARBLE_ALPHA_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::alpha_slow()), 4);
    }
}

#[test]
fn psnr_bc7_opaque_ultra_fast() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::opaque_ultra_fast()), 3);
    }
}

#[test]
fn psnr_bc7_opaque_very_fast() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::opaque_very_fast()), 3);
    }
}

#[test]
fn psnr_bc7_opaque_fast() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::opaque_fast()), 3);
    }
}

#[test]
fn psnr_bc7_opaque_basic() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::opaque_basic()), 3);
    }
}

#[test]
fn psnr_bc7_opaque_slow() {
    for path in [BRICK_FILE_PATH, MARBLE_FILE_PATH, BLENDER_FILE_PATH] {
        compare_psnr(path, CompressionVariant::BC7(BC7Settings::opaque_slow()), 3);
    }
}
--------------------------------------------------------------------------------
/src/encode.rs:
--------------------------------------------------------------------------------
//! CPU based encoding.

#[cfg(feature = "bc15")]
mod bc1_to_5;
#[cfg(feature = "bc6h")]
mod bc6h;
#[cfg(feature = "bc7")]
mod bc7;
#[cfg(any(feature = "bc6h", feature = "bc7"))]
mod common;

#[cfg(feature = "bc15")]
use self::bc1_to_5::BlockCompressorBC15;
#[cfg(feature = "bc6h")]
use self::bc6h::BlockCompressorBC6H;
#[cfg(feature = "bc7")]
use self::bc7::BlockCompressorBC7;
#[cfg(feature = "bc6h")]
use crate::BC6HSettings;
#[cfg(feature = "bc7")]
use crate::BC7Settings;
#[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))]
use crate::CompressionVariant;

/// Compresses raw RGBA8 data using a texture block compression format.
///
/// It supports BC1 through BC7 compression formats and provides CPU-based texture compression
/// for RGBA8 data.
///
/// # Data Layout Requirements
/// The input data must be in RGBA8 format (8 bits per channel, 32 bits per pixel). The data is
/// expected to be in row-major order, with optional stride for padding between rows.
///
/// # Buffer Requirements
/// The destination buffer must have sufficient capacity to store the compressed blocks.
/// The required size can be calculated using [`CompressionVariant::blocks_byte_size()`].
37 | /// 38 | /// For example: 39 | /// ```ignore 40 | /// let required_size = variant.blocks_byte_size(width, height); 41 | /// assert!(blocks_buffer.len() >= required_size); 42 | /// ``` 43 | /// 44 | /// # Arguments 45 | /// * `variation` - The block compression format to use 46 | /// * `rgba_data` - Source RGBA8 pixel data 47 | /// * `blocks_buffer` - Destination buffer for the compressed blocks 48 | /// * `width` - Width of the image in pixels 49 | /// * `height` - Height of the image in pixels 50 | /// * `stride` - Number of bytes per row in the source data (for padding). 51 | /// Must be `width * 4` for tightly packed RGBA data. 52 | /// 53 | /// # Panics 54 | /// * If `width` or `height` is not a multiple of 4 55 | /// * If the destination `blocks_buffer` is too small to hold the compressed data 56 | /// 57 | /// # Example 58 | /// ``` 59 | /// use block_compression::{encode::compress_rgba8, CompressionVariant}; 60 | /// 61 | /// let rgba_data = vec![0u8; 256 * 256 * 4]; // Your RGBA data 62 | /// let width = 256; 63 | /// let height = 256; 64 | /// let stride = width * 4; // Tightly packed rows 65 | /// let variant = CompressionVariant::BC1; 66 | /// 67 | /// let mut blocks_buffer = vec![0u8; variant.blocks_byte_size(width, height)]; 68 | /// 69 | /// compress_rgba8( 70 | /// variant, 71 | /// &rgba_data, 72 | /// &mut blocks_buffer, 73 | /// width, 74 | /// height, 75 | /// stride, 76 | /// ); 77 | /// ``` 78 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 79 | #[cfg_attr( 80 | docsrs, 81 | doc(cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))) 82 | )] 83 | pub fn compress_rgba8( 84 | variation: CompressionVariant, 85 | rgba_data: &[u8], 86 | blocks_buffer: &mut [u8], 87 | width: u32, 88 | height: u32, 89 | stride: u32, 90 | ) { 91 | assert_eq!(height % 4, 0); 92 | assert_eq!(width % 4, 0); 93 | 94 | let required_size = variation.blocks_byte_size(width, height); 95 | 96 | assert!( 97 | blocks_buffer.len() >= required_size, 
98 | "blocks_buffer size ({}) is too small to hold compressed blocks. Required size: {}", 99 | blocks_buffer.len(), 100 | required_size 101 | ); 102 | 103 | let stride = stride as usize; 104 | let block_width = (width as usize).div_ceil(4); 105 | let block_height = (height as usize).div_ceil(4); 106 | 107 | match variation { 108 | #[cfg(feature = "bc15")] 109 | CompressionVariant::BC1 => { 110 | compress_bc1(rgba_data, blocks_buffer, block_width, block_height, stride); 111 | } 112 | #[cfg(feature = "bc15")] 113 | CompressionVariant::BC2 => { 114 | compress_bc2(rgba_data, blocks_buffer, block_width, block_height, stride); 115 | } 116 | #[cfg(feature = "bc15")] 117 | CompressionVariant::BC3 => { 118 | compress_bc3(rgba_data, blocks_buffer, block_width, block_height, stride); 119 | } 120 | #[cfg(feature = "bc15")] 121 | CompressionVariant::BC4 => { 122 | compress_bc4(rgba_data, blocks_buffer, block_width, block_height, stride); 123 | } 124 | #[cfg(feature = "bc15")] 125 | CompressionVariant::BC5 => { 126 | compress_bc5(rgba_data, blocks_buffer, block_width, block_height, stride); 127 | } 128 | #[cfg(feature = "bc6h")] 129 | CompressionVariant::BC6H(settings) => { 130 | compress_bc6h_8bit( 131 | rgba_data, 132 | blocks_buffer, 133 | block_width, 134 | block_height, 135 | stride, 136 | &settings, 137 | ); 138 | } 139 | #[cfg(feature = "bc7")] 140 | CompressionVariant::BC7(settings) => { 141 | compress_bc7( 142 | rgba_data, 143 | blocks_buffer, 144 | block_width, 145 | block_height, 146 | stride, 147 | &settings, 148 | ); 149 | } 150 | } 151 | } 152 | 153 | /// Compresses raw RGBA16 (half-float) data using the BC6H texture block compression format. 154 | /// 155 | /// It supports only BC6H compression format and provides CPU-based texture compression 156 | /// for RGBA16 (half-float) data. 157 | /// 158 | /// # Data Layout Requirements 159 | /// The input data must be in RGBA16 format (16 bits per channel using half-float). 
The data is 160 | /// expected to be in row-major order, with optional stride for padding between rows. 161 | /// 162 | /// # Buffer Requirements 163 | /// The destination buffer must have sufficient capacity to store the compressed blocks. 164 | /// The required size can be calculated using [`CompressionVariant::blocks_byte_size()`]. 165 | /// 166 | /// For example: 167 | /// ```ignore 168 | /// let required_size = variant.blocks_byte_size(width, height); 169 | /// assert!(blocks_buffer.len() >= required_size); 170 | /// ``` 171 | /// 172 | /// # Arguments 173 | /// * `variation` - The block compression format to use (must be BC6H) 174 | /// * `rgb_data` - Source RGBA16 pixel data in half-float format 175 | /// * `blocks_buffer` - Destination buffer for the compressed blocks 176 | /// * `width` - Width of the image in pixels 177 | /// * `height` - Height of the image in pixels 178 | /// * `stride` - Number of half-float elements per row in the source data (for padding). 179 | /// Must be `width * 4` for tightly packed RGBA data. 
180 | /// 181 | /// # Panics 182 | /// * If `width` or `height` is not a multiple of 4 183 | /// * If the destination `blocks_buffer` is too small to hold the compressed data 184 | /// * If `variation` is not `CompressionVariant::BC6H` 185 | /// 186 | /// # Example 187 | /// ``` 188 | /// use block_compression::{encode::compress_rgba16, BC6HSettings, CompressionVariant}; 189 | /// use half::f16; 190 | /// 191 | /// let rgba_data = vec![f16::ZERO; 256 * 256 * 4]; // Your RGBA16 data 192 | /// let width = 256; 193 | /// let height = 256; 194 | /// let stride = width * 4; // Tightly packed rows 195 | /// let settings = BC6HSettings::very_slow(); 196 | /// let variant = CompressionVariant::BC6H(settings); 197 | /// 198 | /// let mut blocks_buffer = vec![0u8; variant.blocks_byte_size(width, height)]; 199 | /// 200 | /// compress_rgba16( 201 | /// variant, 202 | /// &rgba_data, 203 | /// &mut blocks_buffer, 204 | /// width, 205 | /// height, 206 | /// stride, 207 | /// ); 208 | /// ``` 209 | #[cfg(feature = "bc6h")] 210 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 211 | pub fn compress_rgba16( 212 | variation: CompressionVariant, 213 | rgba_data: &[half::f16], 214 | blocks_buffer: &mut [u8], 215 | width: u32, 216 | height: u32, 217 | stride: u32, 218 | ) { 219 | assert_eq!(height % 4, 0); 220 | assert_eq!(width % 4, 0); 221 | 222 | let required_size = variation.blocks_byte_size(width, height); 223 | 224 | assert!( 225 | blocks_buffer.len() >= required_size, 226 | "blocks_buffer size ({}) is too small to hold compressed blocks. 
Required size: {}", 227 | blocks_buffer.len(), 228 | required_size 229 | ); 230 | 231 | let stride = stride as usize; 232 | let block_width = (width as usize).div_ceil(4); 233 | let block_height = (height as usize).div_ceil(4); 234 | 235 | match variation { 236 | CompressionVariant::BC6H(settings) => { 237 | compress_bc6h_16bit( 238 | rgba_data, 239 | blocks_buffer, 240 | block_width, 241 | block_height, 242 | stride, 243 | &settings, 244 | ); 245 | } 246 | #[allow(unreachable_patterns)] 247 | _ => { 248 | panic!("only BC6H is supported for calling compress_rgba16"); 249 | } 250 | } 251 | } 252 | 253 | #[cfg(feature = "bc15")] 254 | fn compress_bc1( 255 | rgba_data: &[u8], 256 | blocks_buffer: &mut [u8], 257 | block_width: usize, 258 | block_height: usize, 259 | stride: usize, 260 | ) { 261 | for yy in 0..block_height { 262 | for xx in 0..block_width { 263 | let mut block_compressor = BlockCompressorBC15::default(); 264 | 265 | block_compressor.load_block_interleaved_rgba(rgba_data, xx, yy, stride); 266 | let color_result = block_compressor.compress_block_bc1_core(); 267 | block_compressor.store_data(blocks_buffer, block_width, xx, yy, &color_result); 268 | } 269 | } 270 | } 271 | 272 | #[cfg(feature = "bc15")] 273 | fn compress_bc2( 274 | rgba_data: &[u8], 275 | blocks_buffer: &mut [u8], 276 | block_width: usize, 277 | block_height: usize, 278 | stride: usize, 279 | ) { 280 | for yy in 0..block_height { 281 | for xx in 0..block_width { 282 | let mut block_compressor = BlockCompressorBC15::default(); 283 | let mut compressed_data = [0; 4]; 284 | 285 | let alpha_result = block_compressor.load_block_alpha_4bit(rgba_data, xx, yy, stride); 286 | 287 | compressed_data[0] = alpha_result[0]; 288 | compressed_data[1] = alpha_result[1]; 289 | 290 | block_compressor.load_block_interleaved_rgba(rgba_data, xx, yy, stride); 291 | 292 | let color_result = block_compressor.compress_block_bc1_core(); 293 | compressed_data[2] = color_result[0]; 294 | compressed_data[3] = 
color_result[1];

            block_compressor.store_data(blocks_buffer, block_width, xx, yy, &compressed_data);
        }
    }
}

/// Compresses every 4x4 tile into a BC3 block (interpolated alpha block
/// followed by a BC1 color block).
#[cfg(feature = "bc15")]
fn compress_bc3(
    rgba_data: &[u8],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC15::default();
            let mut block = [0; 4];

            compressor.load_block_interleaved_rgba(rgba_data, bx, by, stride);

            let alpha = compressor.compress_block_bc3_alpha();
            block[0] = alpha[0];
            block[1] = alpha[1];

            let color = compressor.compress_block_bc1_core();
            block[2] = color[0];
            block[3] = color[1];

            compressor.store_data(blocks_buffer, block_width, bx, by, &block);
        }
    }
}

/// Compresses every 4x4 tile into a BC4 block (single channel, encoded with
/// the BC3 alpha block codec).
#[cfg(feature = "bc15")]
fn compress_bc4(
    rgba_data: &[u8],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC15::default();
            let mut block = [0; 2];

            compressor.load_block_r_8bit(rgba_data, bx, by, stride);

            let red = compressor.compress_block_bc3_alpha();
            block[0] = red[0];
            block[1] = red[1];

            compressor.store_data(blocks_buffer, block_width, bx, by, &block);
        }
    }
}

/// Compresses every 4x4 tile into a BC5 block (two channels, each encoded
/// with the BC3 alpha block codec).
#[cfg(feature = "bc15")]
fn compress_bc5(
    rgba_data: &[u8],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC15::default();
            let mut block = [0; 4];

            compressor.load_block_r_8bit(rgba_data, bx, by, stride);

            let red = compressor.compress_block_bc3_alpha();
            block[0] = red[0];
            block[1] = red[1];

            compressor.load_block_g_8bit(rgba_data, bx, by, stride);

            let green = compressor.compress_block_bc3_alpha();
            block[2] = green[0];
            block[3] = green[1];

            compressor.store_data(blocks_buffer, block_width, bx, by, &block);
        }
    }
}

/// Compresses every 4x4 tile of 8-bit RGBA data into a BC6H block.
#[cfg(feature = "bc6h")]
fn compress_bc6h_8bit(
    rgba_data: &[u8],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
    settings: &BC6HSettings,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC6H::new(settings);
            compressor.load_block_interleaved_8bit(rgba_data, bx, by, stride);
            compressor.compress_bc6h_core();
            compressor.store_data(blocks_buffer, block_width, bx, by);
        }
    }
}

/// Compresses every 4x4 tile of half-float RGBA data into a BC6H block.
#[cfg(feature = "bc6h")]
fn compress_bc6h_16bit(
    rgba_data: &[half::f16],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
    settings: &BC6HSettings,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC6H::new(settings);
            compressor.load_block_interleaved_16bit(rgba_data, bx, by, stride);
            compressor.compress_bc6h_core();
            compressor.store_data(blocks_buffer, block_width, bx, by);
        }
    }
}

/// Compresses every 4x4 tile into a BC7 block.
#[cfg(feature = "bc7")]
fn
compress_bc7(
    rgba_data: &[u8],
    blocks_buffer: &mut [u8],
    block_width: usize,
    block_height: usize,
    stride: usize,
    settings: &BC7Settings,
) {
    for by in 0..block_height {
        for bx in 0..block_width {
            let mut compressor = BlockCompressorBC7::new(settings);

            compressor.load_block_interleaved_rgba(rgba_data, bx, by, stride);
            compressor.compute_opaque_err();
            compressor.compress_block_bc7_core();
            compressor.store_data(blocks_buffer, block_width, bx, by);
        }
    }
}
--------------------------------------------------------------------------------
/src/decode.rs:
--------------------------------------------------------------------------------
//! CPU based decoding.

#[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))]
mod block;

#[cfg(feature = "bc7")]
#[cfg_attr(docsrs, doc(cfg(feature = "bc7")))]
pub use self::block::decode_block_bc7;
#[cfg(feature = "bc15")]
#[cfg_attr(docsrs, doc(cfg(feature = "bc15")))]
pub use self::block::{
    decode_block_bc1, decode_block_bc2, decode_block_bc3, decode_block_bc4, decode_block_bc5,
};
#[cfg(feature = "bc6h")]
#[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))]
pub use self::block::{decode_block_bc6h, decode_block_bc6h_float};
#[cfg(feature = "bc6h")]
use crate::BC6HSettings;
#[cfg(feature = "bc7")]
use crate::BC7Settings;
#[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))]
use crate::CompressionVariant;

/// Trait to decode a BC variant into RGBA8 data.
#[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))]
trait BlockRgba8Decoder {
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize);
    fn block_byte_size() -> u32;
}

/// Trait to decode a BC variant into RGBA16F data.
#[cfg(feature = "bc6h")]
trait BlockRgba16fDecoder {
    fn decode_block_rgba16f(compressed: &[u8], decompressed: &mut [half::f16], pitch: usize);
    fn block_byte_size() -> u32;
}

/// Trait to decode a BC variant into RGBA32F data.
#[cfg(feature = "bc6h")]
trait BlockRgba32fDecoder {
    fn decode_block_rgba32f(compressed: &[u8], decompressed: &mut [f32], pitch: usize);
    fn block_byte_size() -> u32;
}

// Zero-sized marker types; the decoder traits dispatch on them statically.
#[cfg(feature = "bc15")]
struct BC1Decoder;
#[cfg(feature = "bc15")]
struct BC2Decoder;
#[cfg(feature = "bc15")]
struct BC3Decoder;
#[cfg(feature = "bc15")]
struct BC4Decoder;
#[cfg(feature = "bc15")]
struct BC5Decoder;
#[cfg(feature = "bc6h")]
struct BC6HDecoder;
#[cfg(feature = "bc7")]
struct BC7Decoder;

#[cfg(feature = "bc15")]
impl BlockRgba8Decoder for BC1Decoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        decode_block_bc1(compressed, decompressed, pitch)
    }

    fn block_byte_size() -> u32 {
        CompressionVariant::BC1.block_byte_size()
    }
}

#[cfg(feature = "bc15")]
impl BlockRgba8Decoder for BC2Decoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        decode_block_bc2(compressed, decompressed, pitch)
    }

    fn block_byte_size() -> u32 {
        CompressionVariant::BC2.block_byte_size()
    }
}

#[cfg(feature = "bc15")]
impl BlockRgba8Decoder for BC3Decoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        decode_block_bc3(compressed, decompressed, pitch)
    }

    fn block_byte_size() -> u32 {
        CompressionVariant::BC3.block_byte_size()
    }
}

#[cfg(feature = "bc15")]
impl BlockRgba8Decoder for BC4Decoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        // Decode into a tightly packed 4x4 R8 scratch block first.
        const PITCH: usize = 4;
        let mut buffer = [0u8; 16];
        decode_block_bc4(compressed, &mut buffer, PITCH);

        // Expand R8 to RGBA8.
        // NOTE(review): G/B/A are cleared to 0, so the result is fully
        // transparent when read as RGBA — confirm this matches the
        // consumers of the decoded data.
        for y in 0..4 {
            for x in 0..4 {
                let dst = y * pitch + x * 4;
                let src = y * PITCH + x;

                decompressed[dst] = buffer[src];
                decompressed[dst + 1] = 0;
                decompressed[dst + 2] = 0;
                decompressed[dst + 3] = 0;
            }
        }
    }

    fn block_byte_size() -> u32 {
        CompressionVariant::BC4.block_byte_size()
    }
}

#[cfg(feature = "bc15")]
impl BlockRgba8Decoder for BC5Decoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        // Decode into a tightly packed 4x4 RG8 scratch block first.
        const PITCH: usize = 8;
        let mut buffer = [0u8; 32];
        decode_block_bc5(compressed, &mut buffer, PITCH);

        // Expand RG8 to RGBA8 (B/A cleared to 0, matching the BC4 path).
        for y in 0..4 {
            for x in 0..4 {
                let dst = y * pitch + x * 4;
                let src = y * PITCH + x * 2;

                decompressed[dst] = buffer[src];
                decompressed[dst + 1] = buffer[src + 1];
                decompressed[dst + 2] = 0;
                decompressed[dst + 3] = 0;
            }
        }
    }

    fn block_byte_size() -> u32 {
        CompressionVariant::BC5.block_byte_size()
    }
}

/// Converts a linear color value to its 8-bit sRGB encoding, clamped to [0, 255].
#[cfg(feature = "bc6h")]
fn linear_to_srgb(linear: f32) -> u8 {
    // Standard sRGB transfer function with the linear toe segment.
    let v = if linear <= 0.0031308 {
        linear * 12.92
    } else {
        1.055 * linear.powf(1.0 / 2.4) - 0.055
    };

    (v.clamp(0.0, 1.0) * 255.0).round() as u8
}

#[cfg(feature = "bc6h")]
impl BlockRgba8Decoder for BC6HDecoder {
    #[inline(always)]
    fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) {
        // Decode into a tightly packed 4x4 RGB32F scratch block, converted
        // to sRGB bytes below.
        const PITCH: usize = 12;
        let mut buffer = [0.0_f32; 48];
decode_block_bc6h_float(compressed, &mut buffer, PITCH, false); 168 | 169 | // Convert RGB16F to RGBA8 170 | for y in 0..4 { 171 | for x in 0..4 { 172 | let out_pos = y * pitch + x * 4; 173 | let in_pos = y * PITCH + x * 3; 174 | 175 | decompressed[out_pos] = linear_to_srgb(buffer[in_pos]) as _; 176 | decompressed[out_pos + 1] = linear_to_srgb(buffer[in_pos + 1]) as _; 177 | decompressed[out_pos + 2] = linear_to_srgb(buffer[in_pos + 2]) as _; 178 | decompressed[out_pos + 3] = 0; 179 | } 180 | } 181 | } 182 | 183 | fn block_byte_size() -> u32 { 184 | CompressionVariant::BC6H(BC6HSettings::basic()).block_byte_size() 185 | } 186 | } 187 | 188 | #[cfg(feature = "bc7")] 189 | impl BlockRgba8Decoder for BC7Decoder { 190 | #[inline(always)] 191 | fn decode_block_rgba8(compressed: &[u8], decompressed: &mut [u8], pitch: usize) { 192 | decode_block_bc7(compressed, decompressed, pitch) 193 | } 194 | 195 | fn block_byte_size() -> u32 { 196 | CompressionVariant::BC7(BC7Settings::alpha_basic()).block_byte_size() 197 | } 198 | } 199 | 200 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 201 | fn decompress_rgba8( 202 | width: u32, 203 | height: u32, 204 | blocks_data: &[u8], 205 | rgba_data: &mut [u8], 206 | ) { 207 | let blocks_x = width.div_ceil(4); 208 | let blocks_y = height.div_ceil(4); 209 | let block_byte_size = D::block_byte_size() as usize; 210 | let output_row_pitch = width as usize * 4; // Always RGBA 211 | 212 | for by in 0..blocks_y { 213 | for bx in 0..blocks_x { 214 | let block_index = (by * blocks_x + bx) as usize; 215 | let block_offset = block_index * block_byte_size; 216 | 217 | if block_offset + block_byte_size > blocks_data.len() { 218 | break; 219 | } 220 | 221 | let output_offset = (by * 4 * output_row_pitch as u32 + bx * 16) as usize; 222 | 223 | if output_offset < rgba_data.len() { 224 | D::decode_block_rgba8( 225 | &blocks_data[block_offset..block_offset + block_byte_size], 226 | &mut rgba_data[output_offset..], 227 | output_row_pitch, 
228 | ); 229 | } 230 | } 231 | } 232 | } 233 | 234 | #[cfg(feature = "bc6h")] 235 | impl BlockRgba16fDecoder for BC6HDecoder { 236 | #[inline(always)] 237 | fn decode_block_rgba16f(compressed: &[u8], decompressed: &mut [half::f16], pitch: usize) { 238 | decode_block_bc6h(compressed, decompressed, pitch, false); 239 | } 240 | 241 | fn block_byte_size() -> u32 { 242 | CompressionVariant::BC6H(BC6HSettings::basic()).block_byte_size() 243 | } 244 | } 245 | 246 | #[cfg(feature = "bc6h")] 247 | fn decompress_rgba16f( 248 | width: u32, 249 | height: u32, 250 | blocks_data: &[u8], 251 | rgba_data: &mut [half::f16], 252 | ) { 253 | let blocks_x = width.div_ceil(4); 254 | let blocks_y = height.div_ceil(4); 255 | let block_byte_size = D::block_byte_size() as usize; 256 | let output_row_pitch = width as usize * 4; // Always RGBA16f 257 | 258 | for by in 0..blocks_y { 259 | for bx in 0..blocks_x { 260 | let block_index = (by * blocks_x + bx) as usize; 261 | let block_offset = block_index * block_byte_size; 262 | 263 | if block_offset + block_byte_size > blocks_data.len() { 264 | break; 265 | } 266 | 267 | let output_offset = (by * 4 * output_row_pitch as u32 + bx * 16) as usize; 268 | 269 | if output_offset < rgba_data.len() { 270 | D::decode_block_rgba16f( 271 | &blocks_data[block_offset..block_offset + block_byte_size], 272 | &mut rgba_data[output_offset..], 273 | output_row_pitch, 274 | ); 275 | } 276 | } 277 | } 278 | } 279 | 280 | #[cfg(feature = "bc6h")] 281 | impl BlockRgba32fDecoder for BC6HDecoder { 282 | #[inline(always)] 283 | fn decode_block_rgba32f(compressed: &[u8], decompressed: &mut [f32], pitch: usize) { 284 | decode_block_bc6h_float(compressed, decompressed, pitch, false); 285 | } 286 | 287 | fn block_byte_size() -> u32 { 288 | CompressionVariant::BC6H(BC6HSettings::basic()).block_byte_size() 289 | } 290 | } 291 | 292 | #[cfg(feature = "bc6h")] 293 | fn decompress_rgba32f( 294 | width: u32, 295 | height: u32, 296 | blocks_data: &[u8], 297 | rgba_data: &mut 
[f32], 298 | ) { 299 | let blocks_x = width.div_ceil(4); 300 | let blocks_y = height.div_ceil(4); 301 | let block_byte_size = D::block_byte_size() as usize; 302 | let output_row_pitch = width as usize * 4; // Always RGBA32f 303 | 304 | for by in 0..blocks_y { 305 | for bx in 0..blocks_x { 306 | let block_index = (by * blocks_x + bx) as usize; 307 | let block_offset = block_index * block_byte_size; 308 | 309 | if block_offset + block_byte_size > blocks_data.len() { 310 | break; 311 | } 312 | 313 | let output_offset = (by * 4 * output_row_pitch as u32 + bx * 16) as usize; 314 | 315 | if output_offset < rgba_data.len() { 316 | D::decode_block_rgba32f( 317 | &blocks_data[block_offset..block_offset + block_byte_size], 318 | &mut rgba_data[output_offset..], 319 | output_row_pitch, 320 | ); 321 | } 322 | } 323 | } 324 | } 325 | 326 | /// Helper function to easily decompress block data into RGBA8 data. 327 | /// 328 | /// # Panics 329 | /// - The `blocks_data` has not the expected size (`variant.blocks_byte_size()`) 330 | /// - The `rgba_data` has not the expected size (`width * height * 4`) 331 | #[cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))] 332 | #[cfg_attr( 333 | docsrs, 334 | doc(cfg(any(feature = "bc15", feature = "bc6h", feature = "bc7"))) 335 | )] 336 | pub fn decompress_blocks_as_rgba8( 337 | variant: CompressionVariant, 338 | width: u32, 339 | height: u32, 340 | blocks_data: &[u8], 341 | rgba_data: &mut [u8], 342 | ) { 343 | let expected_input_size = variant.blocks_byte_size(width, height); 344 | assert_eq!( 345 | blocks_data.len(), 346 | expected_input_size, 347 | "the input bitstream slice has not the expected size" 348 | ); 349 | 350 | let expected_output_size = width as usize * height as usize * 4; 351 | assert_eq!( 352 | rgba_data.len(), 353 | expected_output_size, 354 | "the output slice has not the expected size" 355 | ); 356 | 357 | match variant { 358 | #[cfg(feature = "bc15")] 359 | CompressionVariant::BC1 => { 360 | 
decompress_rgba8::(width, height, blocks_data, rgba_data) 361 | } 362 | #[cfg(feature = "bc15")] 363 | CompressionVariant::BC2 => { 364 | decompress_rgba8::(width, height, blocks_data, rgba_data) 365 | } 366 | #[cfg(feature = "bc15")] 367 | CompressionVariant::BC3 => { 368 | decompress_rgba8::(width, height, blocks_data, rgba_data) 369 | } 370 | #[cfg(feature = "bc15")] 371 | CompressionVariant::BC4 => { 372 | decompress_rgba8::(width, height, blocks_data, rgba_data) 373 | } 374 | #[cfg(feature = "bc15")] 375 | CompressionVariant::BC5 => { 376 | decompress_rgba8::(width, height, blocks_data, rgba_data) 377 | } 378 | #[cfg(feature = "bc6h")] 379 | CompressionVariant::BC6H(..) => { 380 | decompress_rgba8::(width, height, blocks_data, rgba_data) 381 | } 382 | #[cfg(feature = "bc7")] 383 | CompressionVariant::BC7(..) => { 384 | decompress_rgba8::(width, height, blocks_data, rgba_data) 385 | } 386 | } 387 | } 388 | 389 | /// Helper function to easily decompress block data into RGBA16F data. Only BCH6 is currently supported. 390 | /// 391 | /// # Panics 392 | /// - The `blocks_data` has not the expected size (`variant.blocks_byte_size()`) 393 | /// - The `rgba_data` has not the expected size (`width * height * 4`) 394 | /// - If `variant` is any other value than BC6H. 
395 | #[cfg(feature = "bc6h")] 396 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 397 | pub fn decompress_blocks_as_rgba16f( 398 | variant: CompressionVariant, 399 | width: u32, 400 | height: u32, 401 | blocks_data: &[u8], 402 | rgba_data: &mut [half::f16], 403 | ) { 404 | let expected_input_size = variant.blocks_byte_size(width, height); 405 | 406 | assert_eq!( 407 | blocks_data.len(), 408 | expected_input_size, 409 | "the input bitstream slice has not the expected size" 410 | ); 411 | 412 | let expected_output_size = width as usize * height as usize * 4; 413 | assert_eq!( 414 | rgba_data.len(), 415 | expected_output_size, 416 | "the output slice has not the expected size" 417 | ); 418 | 419 | match variant { 420 | CompressionVariant::BC6H(..) => { 421 | decompress_rgba16f::(width, height, blocks_data, rgba_data) 422 | } 423 | #[allow(unreachable_patterns)] 424 | _ => { 425 | panic!("unsupported compression variant"); 426 | } 427 | } 428 | } 429 | 430 | /// Helper function to easily decompress block data into RGBA32F data. Only BCH6 is currently supported. 431 | /// 432 | /// # Panics 433 | /// - The `blocks_data` has not the expected size (`variant.blocks_byte_size()`) 434 | /// - The `rgba_data` has not the expected size (`width * height * 4`) 435 | /// - If `variant` is any other value than BC6H. 
436 | #[cfg(feature = "bc6h")] 437 | #[cfg_attr(docsrs, doc(cfg(feature = "bc6h")))] 438 | pub fn decompress_blocks_as_rgba32f( 439 | variant: CompressionVariant, 440 | width: u32, 441 | height: u32, 442 | blocks_data: &[u8], 443 | rgba_data: &mut [f32], 444 | ) { 445 | let expected_input_size = variant.blocks_byte_size(width, height); 446 | assert_eq!( 447 | blocks_data.len(), 448 | expected_input_size, 449 | "the input bitstream slice has not the expected size" 450 | ); 451 | 452 | let expected_output_size = width as usize * height as usize * 4; 453 | assert_eq!( 454 | rgba_data.len(), 455 | expected_output_size, 456 | "the output slice has not the expected size" 457 | ); 458 | 459 | match variant { 460 | CompressionVariant::BC6H(..) => { 461 | decompress_rgba32f::(width, height, blocks_data, rgba_data) 462 | } 463 | #[allow(unreachable_patterns)] 464 | _ => { 465 | panic!("unsupported compression variant"); 466 | } 467 | } 468 | } 469 | -------------------------------------------------------------------------------- /src/shader/bc1_to_5.wgsl: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025, Nils Hasenbanck 2 | // Copyright (c) 2016-2024, Intel Corporation 3 | // 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | // documentation files (the "Software"), to deal in the Software without restriction, including without limitation 6 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | // permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | // 9 | // The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | // the Software. 
11 | // 12 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 15 | // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 16 | // SOFTWARE. 17 | 18 | struct Uniforms { 19 | width: u32, 20 | height: u32, 21 | texture_y_offset: u32, 22 | blocks_offset: u32, 23 | } 24 | 25 | @group(0) @binding(0) var source_texture: texture_2d; 26 | @group(0) @binding(1) var block_buffer: array; 27 | @group(0) @binding(2) var uniforms: Uniforms; 28 | 29 | fn sq(x: f32) -> f32 { 30 | return x * x; 31 | } 32 | 33 | fn rsqrt(x: f32) -> f32 { 34 | return 1.0 / sqrt(x); 35 | } 36 | 37 | fn rcp(x: f32) -> f32 { 38 | return 1.0 / x; 39 | } 40 | 41 | fn load_block_interleaved_rgba(block: ptr>, xx: u32, yy: u32) { 42 | for (var y = 0u; y < 4u; y++) { 43 | for (var x = 0u; x < 4u; x++) { 44 | let pixel_x = xx * 4u + x; 45 | let pixel_y = yy * 4u + y; 46 | let rgba = textureLoad(source_texture, vec2(pixel_x, pixel_y), 0); 47 | 48 | (*block)[16u * 0u + y * 4u + x] = rgba.r * 255.0; 49 | (*block)[16u * 1u + y * 4u + x] = rgba.g * 255.0; 50 | (*block)[16u * 2u + y * 4u + x] = rgba.b * 255.0; 51 | (*block)[16u * 3u + y * 4u + x] = rgba.a * 255.0; 52 | } 53 | } 54 | } 55 | 56 | fn load_block_r_8bit(block: ptr>, xx: u32, yy: u32) { 57 | for (var y = 0u; y < 4u; y++) { 58 | for (var x = 0u; x < 4u; x++) { 59 | let pixel_x = xx * 4u + x; 60 | let pixel_y = yy * 4u + y; 61 | let red = textureLoad(source_texture, vec2(pixel_x, pixel_y), 0).r; 62 | 63 | (*block)[48u + y * 4u + x] = red * 255.0; 64 | } 65 | } 66 | } 67 | 68 | fn load_block_g_8bit(block: ptr>, xx: u32, yy: u32) { 69 | for (var y = 0u; y < 4u; y++) { 70 | for (var x = 0u; x < 4u; 
x++) { 71 | let pixel_x = xx * 4u + x; 72 | let pixel_y = yy * 4u + y; 73 | let green = textureLoad(source_texture, vec2(pixel_x, pixel_y), 0).g; 74 | 75 | (*block)[48u + y * 4u + x] = green * 255.0; 76 | } 77 | } 78 | } 79 | 80 | fn load_block_alpha_4bit(xx: u32, yy: u32) -> vec2 { 81 | var alpha_bits: vec2; 82 | 83 | for (var y = 0u; y < 4u; y++) { 84 | for (var x = 0u; x < 4u; x++) { 85 | let pixel_x = xx * 4u + x; 86 | let pixel_y = yy * 4u + y + uniforms.texture_y_offset;; 87 | let alpha = textureLoad(source_texture, vec2(pixel_x, pixel_y), 0).a; 88 | 89 | // Convert alpha to 4 bits (0-15) 90 | let alpha4 = u32(alpha * 15.0); 91 | let bit_position = y * 16u + x * 4u; 92 | 93 | if (bit_position < 32u) { 94 | alpha_bits[0] |= (alpha4 << bit_position); 95 | } else { 96 | alpha_bits[1] |= (alpha4 << (bit_position - 32u)); 97 | } 98 | } 99 | } 100 | 101 | return alpha_bits; 102 | } 103 | 104 | fn store_data_2(block_width: u32, xx: u32, yy: u32, data: vec2) { 105 | let offset = uniforms.blocks_offset + (yy * block_width * 2u + xx * 2u); 106 | 107 | block_buffer[offset + 0] = data[0]; 108 | block_buffer[offset + 1] = data[1]; 109 | } 110 | 111 | fn store_data_4(block_width: u32, xx: u32, yy: u32, data: vec4) { 112 | let offset = uniforms.blocks_offset + (yy * block_width * 4u + xx * 4u); 113 | 114 | block_buffer[offset + 0] = data[0]; 115 | block_buffer[offset + 1] = data[1]; 116 | block_buffer[offset + 2] = data[2]; 117 | block_buffer[offset + 3] = data[3]; 118 | } 119 | 120 | fn compute_covar_dc( 121 | covar: ptr>, 122 | dc: ptr>, 123 | block: ptr>, 124 | ) { 125 | for (var p = 0u; p < 3u; p++) { 126 | var acc = 0.0; 127 | for (var k = 0u; k < 16u; k++) { 128 | acc += (*block)[k + p * 16u]; 129 | } 130 | (*dc)[p] = acc / 16.0; 131 | } 132 | 133 | var covar0 = 0.0; 134 | var covar1 = 0.0; 135 | var covar2 = 0.0; 136 | var covar3 = 0.0; 137 | var covar4 = 0.0; 138 | var covar5 = 0.0; 139 | 140 | for (var k = 0u; k < 16u; k++) { 141 | let rgb0 = (*block)[k + 0u * 16u] 
- (*dc)[0]; 142 | let rgb1 = (*block)[k + 1u * 16u] - (*dc)[1]; 143 | let rgb2 = (*block)[k + 2u * 16u] - (*dc)[2]; 144 | 145 | covar0 += rgb0 * rgb0; 146 | covar1 += rgb0 * rgb1; 147 | covar2 += rgb0 * rgb2; 148 | covar3 += rgb1 * rgb1; 149 | covar4 += rgb1 * rgb2; 150 | covar5 += rgb2 * rgb2; 151 | } 152 | 153 | (*covar)[0] = covar0; 154 | (*covar)[1] = covar1; 155 | (*covar)[2] = covar2; 156 | (*covar)[3] = covar3; 157 | (*covar)[4] = covar4; 158 | (*covar)[5] = covar5; 159 | } 160 | 161 | fn ssymv(result: ptr>, covar: ptr>, a_vector: ptr>) { 162 | (*result)[0] = (*covar)[0] * (*a_vector)[0] + (*covar)[1] * (*a_vector)[1] + (*covar)[2] * (*a_vector)[2]; 163 | (*result)[1] = (*covar)[1] * (*a_vector)[0] + (*covar)[3] * (*a_vector)[1] + (*covar)[4] * (*a_vector)[2]; 164 | (*result)[2] = (*covar)[2] * (*a_vector)[0] + (*covar)[4] * (*a_vector)[1] + (*covar)[5] * (*a_vector)[2]; 165 | } 166 | 167 | fn compute_axis3(axis: ptr>, covar: ptr>, powerIterations: i32) { 168 | var a_vector = vec3(1.0, 1.0, 1.0); 169 | 170 | for (var i = 0; i < powerIterations; i++) { 171 | ssymv(axis, covar, &a_vector); 172 | 173 | for (var p = 0u; p < 3u; p++) { 174 | a_vector[p] = (*axis)[p]; 175 | } 176 | 177 | if (i % 2 == 1) { 178 | var norm_sq = 0.0; 179 | for (var p = 0u; p < 3u; p++) { 180 | norm_sq += (*axis)[p] * (*axis)[p]; 181 | } 182 | 183 | let rnorm = rsqrt(norm_sq); 184 | for (var p = 0u; p < 3u; p++) { 185 | a_vector[p] *= rnorm; 186 | } 187 | } 188 | } 189 | 190 | for (var p = 0u; p < 3u; p++) { 191 | (*axis)[p] = a_vector[p]; 192 | } 193 | } 194 | 195 | fn pick_endpoints( 196 | c0: ptr>, 197 | c1: ptr>, 198 | block: ptr>, 199 | axis: ptr>, 200 | dc: ptr> 201 | ) { 202 | var min_dot = 256.0 * 256.0; 203 | var max_dot = 0.0; 204 | 205 | for (var y = 0u; y < 4u; y++) { 206 | for (var x = 0u; x < 4u; x++) { 207 | var dot = 0.0; 208 | for (var p = 0u; p < 3u; p++) { 209 | dot += ((*block)[p * 16u + y * 4u + x] - (*dc)[p]) * (*axis)[p]; 210 | } 211 | 212 | min_dot = 
min(min_dot, dot); 213 | max_dot = max(max_dot, dot); 214 | } 215 | } 216 | 217 | if (max_dot - min_dot < 1.0) { 218 | min_dot -= 0.5; 219 | max_dot += 0.5; 220 | } 221 | 222 | var norm_sq = 0.0; 223 | for (var p = 0u; p < 3u; p++) { 224 | norm_sq += (*axis)[p] * (*axis)[p]; 225 | } 226 | 227 | let rnorm_sq = rcp(norm_sq); 228 | for (var p = 0u; p < 3u; p++) { 229 | (*c0)[p] = clamp((*dc)[p] + min_dot * rnorm_sq * (*axis)[p], 0.0, 255.0); 230 | (*c1)[p] = clamp((*dc)[p] + max_dot * rnorm_sq * (*axis)[p], 0.0, 255.0); 231 | } 232 | } 233 | 234 | fn dec_rgb565(c: ptr>, p: i32) { 235 | let b5 = (p >> 0) & 31; 236 | let g6 = (p >> 5) & 63; 237 | let r5 = (p >> 11) & 31; 238 | 239 | (*c)[0] = f32((r5 << 3) + (r5 >> 2)); 240 | (*c)[1] = f32((g6 << 2) + (g6 >> 4)); 241 | (*c)[2] = f32((b5 << 3) + (b5 >> 2)); 242 | } 243 | 244 | fn enc_rgb565(c: ptr>) -> i32 { 245 | let r = i32((*c)[0]); 246 | let g = i32((*c)[1]); 247 | let b = i32((*c)[2]); 248 | 249 | let r5 = (r * 31 + 128 + ((r * 31) >> 8)) >> 8; 250 | let g6 = (g * 63 + 128 + ((g * 63) >> 8)) >> 8; 251 | let b5 = (b * 31 + 128 + ((b * 31) >> 8)) >> 8; 252 | 253 | return (r5 << 11) + (g6 << 5) + b5; 254 | } 255 | 256 | fn fast_quant(block: ptr>, p0: i32, p1: i32) -> u32 { 257 | var c0: vec3; 258 | var c1: vec3; 259 | dec_rgb565(&c0, p0); 260 | dec_rgb565(&c1, p1); 261 | 262 | var dir: vec3; 263 | for (var p = 0u; p < 3u; p++) { 264 | dir[p] = c1[p] - c0[p]; 265 | } 266 | 267 | var sq_norm = 0.0; 268 | for (var p = 0u; p < 3u; p++) { 269 | sq_norm += sq(dir[p]); 270 | } 271 | 272 | let rsq_norm = rcp(sq_norm); 273 | 274 | for (var p = 0u; p < 3u; p++) { 275 | dir[p] *= rsq_norm * 3.0; 276 | } 277 | 278 | var bias = 0.5; 279 | for (var p = 0u; p < 3u; p++) { 280 | bias -= c0[p] * dir[p]; 281 | } 282 | 283 | var bits = 0u; 284 | var scaler = 1u; 285 | for (var k = 0u; k < 16u; k++) { 286 | var dot = 0.0; 287 | for (var p = 0u; p < 3u; p++) { 288 | dot += (*block)[k + p * 16u] * dir[p]; 289 | } 290 | 291 | let q = 
clamp(i32(dot + bias), 0, 3); 292 | bits += u32(q) * scaler; 293 | scaler *= 4u; 294 | } 295 | 296 | return bits; 297 | } 298 | 299 | fn bc1_refine(pe: ptr>, block: ptr>, bits: u32, dc: ptr>) { 300 | var c0: vec3; 301 | var c1: vec3; 302 | 303 | if ((bits ^ (bits * 4u)) < 4u) { 304 | for (var p = 0u; p < 3u; p++) { 305 | c0[p] = (*dc)[p]; 306 | c1[p] = (*dc)[p]; 307 | } 308 | } else { 309 | var atb1: vec3; 310 | var sum_q = 0.0; 311 | var sum_qq = 0.0; 312 | var shifted_bits = bits; 313 | 314 | for (var k = 0u; k < 16u; k++) { 315 | let q = f32(shifted_bits & 3u); 316 | shifted_bits = shifted_bits >> 2u; 317 | 318 | let x = 3.0 - q; 319 | 320 | sum_q += q; 321 | sum_qq += q * q; 322 | 323 | for (var p = 0u; p < 3u; p++) { 324 | atb1[p] += x * (*block)[k + p * 16u]; 325 | } 326 | } 327 | 328 | var sum: vec3; 329 | var atb2: vec3; 330 | 331 | for (var p = 0u; p < 3u; p++) { 332 | sum[p] = (*dc)[p] * 16.0; 333 | atb2[p] = 3.0 * sum[p] - atb1[p]; 334 | } 335 | 336 | let cxx = 16.0 * sq(3.0) - 2.0 * 3.0 * sum_q + sum_qq; 337 | let cyy = sum_qq; 338 | let cxy = 3.0 * sum_q - sum_qq; 339 | let scale = 3.0 * rcp(cxx * cyy - cxy * cxy); 340 | 341 | for (var p = 0u; p < 3u; p++) { 342 | c0[p] = (atb1[p] * cyy - atb2[p] * cxy) * scale; 343 | c1[p] = (atb2[p] * cxx - atb1[p] * cxy) * scale; 344 | 345 | c0[p] = clamp(c0[p], 0.0, 255.0); 346 | c1[p] = clamp(c1[p], 0.0, 255.0); 347 | } 348 | } 349 | 350 | (*pe)[0] = enc_rgb565(&c0); 351 | (*pe)[1] = enc_rgb565(&c1); 352 | } 353 | 354 | fn fix_qbits(qbits: u32) -> u32 { 355 | const MASK_01B: u32 = 0x55555555u; 356 | const MASK_10B: u32 = 0xAAAAAAAAu; 357 | 358 | let qbits0 = qbits & MASK_01B; 359 | let qbits1 = qbits & MASK_10B; 360 | return (qbits1 >> 1u) + (qbits1 ^ (qbits0 << 1u)); 361 | } 362 | 363 | fn compress_block_bc1_core(block: ptr>) -> vec2 { 364 | let power_iterations = 4; 365 | let refine_iterations = 1; 366 | 367 | var covar: array; 368 | var dc: vec3; 369 | compute_covar_dc(&covar, &dc, block); 370 | 371 | const eps 
= 0.001; 372 | covar[0] += eps; 373 | covar[3] += eps; 374 | covar[5] += eps; 375 | 376 | var axis: vec3; 377 | compute_axis3(&axis, &covar, power_iterations); 378 | 379 | var c0: vec3; 380 | var c1: vec3; 381 | pick_endpoints(&c0, &c1, block, &axis, &dc); 382 | 383 | var p: vec2; 384 | p[0] = enc_rgb565(&c0); 385 | p[1] = enc_rgb565(&c1); 386 | if (p[0] < p[1]) { 387 | let temp = p[0]; 388 | p[0] = p[1]; 389 | p[1] = temp; 390 | } 391 | 392 | var data: vec2; 393 | data[0] = (u32(p[1]) << 16u) | u32(p[0]); 394 | data[1] = fast_quant(block, p[0], p[1]); 395 | 396 | for (var i = 0; i < refine_iterations; i++) { 397 | bc1_refine(&p, block, data[1], &dc); 398 | if (p[0] < p[1]) { 399 | let temp = p[0]; 400 | p[0] = p[1]; 401 | p[1] = temp; 402 | } 403 | data[0] = (u32(p[1]) << 16u) | u32(p[0]); 404 | data[1] = fast_quant(block, p[0], p[1]); 405 | } 406 | 407 | data[1] = fix_qbits(data[1]); 408 | return data; 409 | } 410 | 411 | fn compress_block_bc3_alpha(block: ptr>) -> vec2 { 412 | var ep = vec2(255.0, 0.0); 413 | 414 | // Find min/max endpoints using block[48] to block[63] for alpha 415 | for (var k: u32 = 0u; k < 16u; k++) { 416 | ep[0] = min(ep[0], (*block)[48 + k]); 417 | ep[1] = max(ep[1], (*block)[48 + k]); 418 | } 419 | 420 | // Prevent division by zero 421 | if (ep[0] == ep[1]) { 422 | ep[1] = ep[0] + 0.1; 423 | } 424 | 425 | var qblock: vec2; 426 | let scale = 7.0 / (ep[1] - ep[0]); 427 | 428 | for (var k: u32 = 0u; k < 16u; k++) { 429 | let v = (*block)[48u + k]; 430 | let proj = (v - ep[0]) * scale + 0.5; 431 | 432 | var q = clamp(i32(proj), 0, 7); 433 | q = 7 - q; 434 | 435 | if (q > 0) { 436 | q += 1; 437 | } 438 | if (q == 8) { 439 | q = 1; 440 | } 441 | 442 | qblock[k / 8u] |= u32(q) << ((k % 8u) * 3u); 443 | } 444 | 445 | var data: vec2; 446 | data[0] = (clamp(u32(ep[0]), 0u, 255u) << 8u) | clamp(u32(ep[1]), 0u, 255u); 447 | data[0] |= qblock[0] << 16u; 448 | data[1] = qblock[0] >> 16u; 449 | data[1] |= qblock[1] << 8u; 450 | 451 | return data; 452 | 
} 453 | 454 | @compute 455 | @workgroup_size(8, 8) 456 | fn compress_bc1(@builtin(global_invocation_id) global_id: vec3) { 457 | let xx = global_id.x; 458 | let yy = global_id.y; 459 | 460 | let block_width = (uniforms.width + 3u) / 4u; 461 | let block_height = (uniforms.height + 3u) / 4u; 462 | 463 | if (xx >= block_width || yy >= block_height) { 464 | return; 465 | } 466 | 467 | var block: array; 468 | var compressed_data: vec2; 469 | 470 | load_block_interleaved_rgba(&block, xx, yy); 471 | 472 | let color_result = compress_block_bc1_core(&block); 473 | compressed_data[0] = color_result[0]; 474 | compressed_data[1] = color_result[1]; 475 | 476 | store_data_2(block_width, xx, yy, compressed_data); 477 | } 478 | 479 | @compute 480 | @workgroup_size(8, 8) 481 | fn compress_bc2(@builtin(global_invocation_id) global_id: vec3) { 482 | let xx = global_id.x; 483 | let yy = global_id.y; 484 | 485 | let block_width = (uniforms.width + 3u) / 4u; 486 | let block_height = (uniforms.height + 3u) / 4u; 487 | 488 | if (xx >= block_width || yy >= block_height) { 489 | return; 490 | } 491 | 492 | var block: array; 493 | var compressed_data: vec4; 494 | 495 | let alpha_result = load_block_alpha_4bit(xx, yy); 496 | compressed_data[0] = alpha_result[0]; 497 | compressed_data[1] = alpha_result[1]; 498 | 499 | load_block_interleaved_rgba(&block, xx, yy); 500 | 501 | let color_result = compress_block_bc1_core(&block); 502 | compressed_data[2] = color_result[0]; 503 | compressed_data[3] = color_result[1]; 504 | 505 | store_data_4(block_width, xx, yy, compressed_data); 506 | } 507 | 508 | @compute 509 | @workgroup_size(8, 8) 510 | fn compress_bc3(@builtin(global_invocation_id) global_id: vec3) { 511 | let xx = global_id.x; 512 | let yy = global_id.y; 513 | 514 | let block_width = (uniforms.width + 3u) / 4u; 515 | let block_height = (uniforms.height + 3u) / 4u; 516 | 517 | if (xx >= block_width || yy >= block_height) { 518 | return; 519 | } 520 | 521 | var block: array; 522 | var 
compressed_data: vec4; 523 | 524 | load_block_interleaved_rgba(&block, xx, yy); 525 | 526 | let alpha_result = compress_block_bc3_alpha(&block); 527 | compressed_data[0] = alpha_result[0]; 528 | compressed_data[1] = alpha_result[1]; 529 | 530 | let color_result = compress_block_bc1_core(&block); 531 | compressed_data[2] = color_result[0]; 532 | compressed_data[3] = color_result[1]; 533 | 534 | store_data_4(block_width, xx, yy, compressed_data); 535 | } 536 | 537 | @compute 538 | @workgroup_size(8, 8) 539 | fn compress_bc4(@builtin(global_invocation_id) global_id: vec3) { 540 | let xx = global_id.x; 541 | let yy = global_id.y; 542 | 543 | let block_width = (uniforms.width + 3u) / 4u; 544 | let block_height = (uniforms.height + 3u) / 4u; 545 | 546 | if (xx >= block_width || yy >= block_height) { 547 | return; 548 | } 549 | 550 | var block: array; 551 | var compressed_data: vec2; 552 | 553 | load_block_r_8bit(&block, xx, yy); 554 | 555 | let color_result = compress_block_bc3_alpha(&block); 556 | compressed_data[0] = color_result[0]; 557 | compressed_data[1] = color_result[1]; 558 | 559 | store_data_2(block_width, xx, yy, compressed_data); 560 | } 561 | 562 | @compute 563 | @workgroup_size(8, 8) 564 | fn compress_bc5(@builtin(global_invocation_id) global_id: vec3) { 565 | let xx = global_id.x; 566 | let yy = global_id.y; 567 | 568 | let block_width = (uniforms.width + 3u) / 4u; 569 | let block_height = (uniforms.height + 3u) / 4u; 570 | 571 | if (xx >= block_width || yy >= block_height) { 572 | return; 573 | } 574 | 575 | var block: array; 576 | var compressed_data: vec4; 577 | 578 | load_block_r_8bit(&block, xx, yy); 579 | 580 | let red_result = compress_block_bc3_alpha(&block); 581 | compressed_data[0] = red_result[0]; 582 | compressed_data[1] = red_result[1]; 583 | 584 | load_block_g_8bit(&block, xx, yy); 585 | 586 | let green_result = compress_block_bc3_alpha(&block); 587 | compressed_data[2] = green_result[0]; 588 | compressed_data[3] = green_result[1]; 589 | 590 
| store_data_4(block_width, xx, yy, compressed_data); 591 | } 592 | -------------------------------------------------------------------------------- /src/encode/common.rs: -------------------------------------------------------------------------------- 1 | #[inline(always)] 2 | pub(crate) const fn sq(x: f32) -> f32 { 3 | x * x 4 | } 5 | 6 | pub(crate) fn get_unquant_value(bits: u32, index: i32) -> i32 { 7 | match bits { 8 | 2 => { 9 | const TABLE: [i32; 16] = [0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; 10 | TABLE[index as usize] 11 | } 12 | 3 => { 13 | const TABLE: [i32; 16] = [0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0]; 14 | TABLE[index as usize] 15 | } 16 | _ => { 17 | const TABLE: [i32; 16] = [0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64]; 18 | TABLE[index as usize] 19 | } 20 | } 21 | } 22 | 23 | pub(crate) fn get_pattern(part_id: i32) -> u32 { 24 | const PATTERN_TABLE: [u32; 128] = [ 25 | 0x50505050, 0x40404040, 0x54545454, 0x54505040, 0x50404000, 0x55545450, 0x55545040, 26 | 0x54504000, 0x50400000, 0x55555450, 0x55544000, 0x54400000, 0x55555440, 0x55550000, 27 | 0x55555500, 0x55000000, 0x55150100, 0x00004054, 0x15010000, 0x00405054, 0x00004050, 28 | 0x15050100, 0x05010000, 0x40505054, 0x00404050, 0x05010100, 0x14141414, 0x05141450, 29 | 0x01155440, 0x00555500, 0x15014054, 0x05414150, 0x44444444, 0x55005500, 0x11441144, 30 | 0x05055050, 0x05500550, 0x11114444, 0x41144114, 0x44111144, 0x15055054, 0x01055040, 31 | 0x05041050, 0x05455150, 0x14414114, 0x50050550, 0x41411414, 0x00141400, 0x00041504, 32 | 0x00105410, 0x10541000, 0x04150400, 0x50410514, 0x41051450, 0x05415014, 0x14054150, 33 | 0x41050514, 0x41505014, 0x40011554, 0x54150140, 0x50505500, 0x00555050, 0x15151010, 34 | 0x54540404, 0xAA685050, 0x6A5A5040, 0x5A5A4200, 0x5450A0A8, 0xA5A50000, 0xA0A05050, 35 | 0x5555A0A0, 0x5A5A5050, 0xAA550000, 0xAA555500, 0xAAAA5500, 0x90909090, 0x94949494, 36 | 0xA4A4A4A4, 0xA9A59450, 0x2A0A4250, 0xA5945040, 0x0A425054, 0xA5A5A500, 
// NOTE(review): the first lines below are the tail of a packed partition
// lookup table belonging to a function whose signature lies before this
// chunk; reproduced unchanged.
        0x55A0A0A0, 0xA8A85454, 0x6A6A4040, 0xA4A45000, 0x1A1A0500, 0x0050A4A4, 0xAAA59090, 0x14696914,
        0x69691400, 0xA08585A0, 0xAA821414, 0x50A4A450, 0x6A5A0200, 0xA9A58000, 0x5090A0A8,
        0xA8A09050, 0x24242424, 0x00AA5500, 0x24924924, 0x24499224, 0x50A50A50, 0x500AA550,
        0xAAAA4444, 0x66660000, 0xA5A0A5A0, 0x50A050A0, 0x69286928, 0x44AAAA44, 0x66666600,
        0xAA444444, 0x54A854A8, 0x95809580, 0x96969600, 0xA85454A8, 0x80959580, 0xAA141414,
        0x96960000, 0xAAAA1414, 0xA05050A0, 0xA0A5A5A0, 0x96000000, 0x40804080, 0xA9A8A9A8,
        0xAAAAAA44, 0x2A4A5254,
    ];

    PATTERN_TABLE[part_id as usize]
}

/// Returns a 16-bit mask (one bit per pixel of the 4x4 block) selecting the
/// pixels that belong to subset `j` of BC7 partition `part_id`.
///
/// Each table entry packs two 16-bit masks: the low half is subset 0, the
/// high half is subset 1. Subset 2 (3-subset modes) is derived as the pixels
/// contained in neither stored mask.
pub(crate) fn get_pattern_mask(part_id: i32, j: u32) -> u32 {
    const PATTERN_MASK_TABLE: [u32; 128] = [
        0xCCCC3333, 0x88887777, 0xEEEE1111, 0xECC81337, 0xC880377F, 0xFEEC0113, 0xFEC80137,
        0xEC80137F, 0xC80037FF, 0xFFEC0013, 0xFE80017F, 0xE80017FF, 0xFFE80017, 0xFF0000FF,
        0xFFF0000F, 0xF0000FFF, 0xF71008EF, 0x008EFF71, 0x71008EFF, 0x08CEF731, 0x008CFF73,
        0x73108CEF, 0x3100CEFF, 0x8CCE7331, 0x088CF773, 0x3110CEEF, 0x66669999, 0x366CC993,
        0x17E8E817, 0x0FF0F00F, 0x718E8E71, 0x399CC663, 0xAAAA5555, 0xF0F00F0F, 0x5A5AA5A5,
        0x33CCCC33, 0x3C3CC3C3, 0x55AAAA55, 0x96966969, 0xA55A5AA5, 0x73CE8C31, 0x13C8EC37,
        0x324CCDB3, 0x3BDCC423, 0x69969669, 0xC33C3CC3, 0x99666699, 0x0660F99F, 0x0272FD8D,
        0x04E4FB1B, 0x4E40B1BF, 0x2720D8DF, 0xC93636C9, 0x936C6C93, 0x39C6C639, 0x639C9C63,
        0x93366CC9, 0x9CC66339, 0x817E7E81, 0xE71818E7, 0xCCF0330F, 0x0FCCF033, 0x774488BB,
        0xEE2211DD, 0x08CC0133, 0x8CC80037, 0xCC80006F, 0xEC001331, 0x330000FF, 0x00CC3333,
        0xFF000033, 0xCCCC0033, 0x0F0000FF, 0x0FF0000F, 0x00F0000F, 0x44443333, 0x66661111,
        0x22221111, 0x136C0013, 0x008C8C63, 0x36C80137, 0x08CEC631, 0x3330000F, 0xF0000333,
        0x00EE1111, 0x88880077, 0x22C0113F, 0x443088CF, 0x0C22F311, 0x03440033, 0x69969009,
        0x9960009F, 0x03303443, 0x00660699, 0xC22C3113, 0x8C0000EF, 0x1300007F, 0xC4003331,
        0x004C1333, 0x22229999, 0x00F0F00F, 0x24929249, 0x29429429, 0xC30C30C3, 0xC03C3C03,
        0x00AA0055, 0xAA0000FF, 0x30300303, 0xC0C03333, 0x90900909, 0xA00A5005, 0xAAA0000F,
        0x0AAA0555, 0xE0E01111, 0x70700707, 0x6660000F, 0x0EE01111, 0x07707007, 0x06660999,
        0x660000FF, 0x00660099, 0x0CC03333, 0x03303003, 0x60000FFF, 0x80807777, 0x10100101,
        0x000A0005, 0x08CE8421,
    ];

    let mask_packed = PATTERN_MASK_TABLE[part_id as usize];
    let mask0 = mask_packed & 0xFFFF;
    let mask1 = mask_packed >> 16;

    if j == 2 {
        // Subset 2: pixels that are in neither of the two stored masks.
        !mask0 & !mask1
    } else if j == 0 {
        mask0
    } else {
        mask1
    }
}

/// Returns the anchor ("skip") pixel indices for the subsets of partition
/// `part_id` as `[anchor0, anchor1, anchor2]`.
///
/// Anchor 0 is always pixel 0; anchors 1 and 2 are packed into one byte per
/// table entry (high nibble / low nibble).
pub(crate) fn get_skips(part_id: i32) -> [u32; 3] {
    const SKIP_TABLE: [u32; 128] = [
        0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
        0xF0, 0xF0, 0x20, 0x80, 0x20, 0x20, 0x80, 0x80, 0xF0, 0x20, 0x80, 0x20, 0x20, 0x80, 0x80,
        0x20, 0x20, 0xF0, 0xF0, 0x60, 0x80, 0x20, 0x80, 0xF0, 0xF0, 0x20, 0x80, 0x20, 0x20, 0x20,
        0xF0, 0xF0, 0x60, 0x60, 0x20, 0x60, 0x80, 0xF0, 0xF0, 0x20, 0x20, 0xF0, 0xF0, 0xF0, 0xF0,
        0xF0, 0x20, 0x20, 0xF0, 0x3F, 0x38, 0xF8, 0xF3, 0x8F, 0x3F, 0xF3, 0xF8, 0x8F, 0x8F, 0x6F,
        0x6F, 0x6F, 0x5F, 0x3F, 0x38, 0x3F, 0x38, 0x8F, 0xF3, 0x3F, 0x38, 0x6F, 0xA8, 0x53, 0x8F,
        0x86, 0x6A, 0x8F, 0x5F, 0xFA, 0xF8, 0x8F, 0xF3, 0x3F, 0x5A, 0x6A, 0xA8, 0x89, 0xFA, 0xF6,
        0x3F, 0xF8, 0x5F, 0xF3, 0xF6, 0xF6, 0xF8, 0x3F, 0xF3, 0x5F, 0x5F, 0x5F, 0x8F, 0x5F, 0xAF,
        0x5F, 0xAF, 0x8F, 0xDF, 0xF3, 0xCF, 0x3F, 0x38,
    ];

    let skip_packed = SKIP_TABLE[part_id as usize];

    [0, skip_packed >> 4, skip_packed & 15]
}

/// Appends the low `bits` bits of `v` to the 160-bit output `data` at bit
/// position `*pos`, advancing `*pos`.
///
/// Handles writes that straddle a 32-bit word boundary by spilling the high
/// part into the next word. Assumes `v` has no set bits above `bits`
/// (callers pass pre-masked values).
pub(crate) fn put_bits(data: &mut [u32; 5], pos: &mut u32, bits: u32, v: u32) {
    data[(*pos / 32) as usize] |= v << (*pos % 32);
    if *pos % 32 + bits > 32 {
        // Write spilled over into the next word.
        data[(*pos / 32 + 1) as usize] |= v >> (32 - *pos % 32);
    }
    *pos += bits;
}

/// Shifts the 160-bit value in `data` right by one bit, starting at bit
/// position `from_bits`; all bits below `from_bits` are left untouched.
///
/// Used to remove the implicit anchor-index MSB when encoding BC7 index
/// data. Only the `64..128` bit range is handled, which is where BC7 index
/// bits live.
pub(crate) fn data_shl_1bit_from(data: &mut [u32; 5], from_bits: usize) {
    if from_bits < 96 {
        let shifted = (data[2] >> 1) | (data[3] << 31);
        // Keep the bits below `from_bits` in word 2, take the shifted bits above.
        let mask = ((1 << (from_bits - 64)) - 1) >> 1;
        data[2] = (mask & data[2]) | (!mask & shifted);
        data[3] = (data[3] >> 1) | (data[4] << 31);
        data[4] >>= 1;
    } else if from_bits < 128 {
        let shifted = (data[3] >> 1) | (data[4] << 31);
        let mask = ((1 << (from_bits - 96)) - 1) >> 1;
        data[3] = (mask & data[3]) | (!mask & shifted);
        data[4] >>= 1;
    }
}

/// Moves the `partial_count` smallest of the first `length` elements of
/// `list` to the front, in ascending order (partial selection sort).
pub(crate) fn partial_sort_list(list: &mut [i32], length: usize, partial_count: u32) {
    for k in 0..partial_count as usize {
        let mut best_idx = k;
        let mut best_value = list[k];

        for i in k + 1..length {
            if best_value > list[i] {
                best_value = list[i];
                best_idx = i;
            }
        }

        list.swap(k, best_idx);
    }
}

/// Refines endpoint pair `ep` (layout: `ep[0..4]` = endpoint A,
/// `ep[4..8]` = endpoint B) by least-squares fitting against the pixels of
/// `block` selected by `mask`, given the quantized indices in `qblock`
/// (4 bits per pixel) at `bits` index precision.
///
/// `block` is stored planar: 16 values per channel. If the system is nearly
/// singular the endpoints are flattened to the masked average.
pub(crate) fn opt_endpoints(
    ep: &mut [f32],
    block: &[f32; 64],
    bits: u32,
    qblock: [u32; 2],
    mask: u32,
    channels: usize,
) {
    let levels = 1 << bits;

    let mut atb1 = [0.0; 4];
    let mut sum_q = 0.0;
    let mut sum_qq = 0.0;
    let mut sum = [0.0; 5]; // sum[0..4] per-channel sums, sum[4] = pixel count

    let mut mask_shifted = mask << 1;
    for k1 in 0..2 {
        let mut qbits_shifted = qblock[k1];
        for k2 in 0..8 {
            let k = k1 * 8 + k2;
            let q = (qbits_shifted & 15) as f32;
            qbits_shifted >>= 4;

            mask_shifted >>= 1;
            if (mask_shifted & 1) == 0 {
                continue; // pixel not in this subset
            }

            let x = (levels - 1) as f32 - q;

            sum_q += q;
            sum_qq += q * q;

            sum[4] += 1.0;
            for p in 0..channels {
                sum[p] += block[k + p * 16];
                atb1[p] += x * block[k + p * 16];
            }
        }
    }

    let mut atb2 = [0.0; 4];
    for p in 0..channels {
        atb2[p] = (levels - 1) as f32 * sum[p] - atb1[p];
    }

    // 2x2 normal-equation system for the two endpoints.
    let cxx = sum[4] * sq((levels - 1) as f32) - 2.0 * (levels - 1) as f32 * sum_q + sum_qq;
    let cyy = sum_qq;
    let cxy = (levels - 1) as f32 * sum_q - sum_qq;
    let scale = (levels - 1) as f32 / (cxx * cyy - cxy * cxy);

    for p in 0..channels {
        ep[p] = (atb1[p] * cyy - atb2[p] * cxy) * scale;
        ep[4 + p] = (atb2[p] * cxx - atb1[p] * cxy) * scale;
    }

    // Nearly singular determinant: fall back to the masked average.
    if f32::abs(cxx * cyy - cxy * cxy) < 0.001 {
        // flatten
        for p in 0..channels {
            ep[p] = sum[p] / sum[4];
            ep[4 + p] = ep[p];
        }
    }
}

// Principal Component Analysis (PCA) bound
/// Estimates an upper bound on the residual variance left after projecting
/// the covariance `covar` onto its dominant eigenvector (trace minus the
/// approximated largest eigenvalue). Used for cheap partition ranking.
pub(crate) fn get_pca_bound(covar: &[f32; 10], channels: usize) -> f32 {
    const POWER_ITERATIONS: u32 = 4; // Quite approximative, but enough for bounding

    let mut covar_scaled = *covar;
    let inv_var = 1.0 / (256.0 * 256.0);
    for covar_scaled in covar_scaled.iter_mut() {
        *covar_scaled *= inv_var;
    }

    // Regularize the diagonal so power iteration stays stable.
    const EPS: f32 = sq(0.001);
    covar_scaled[0] += EPS;
    covar_scaled[4] += EPS;
    covar_scaled[7] += EPS;

    let mut axis = [0.0; 4];
    compute_axis(&mut axis, &covar_scaled, POWER_ITERATIONS, channels);

    let mut a_vec = [0.0; 4];
    if channels == 3 {
        ssymv3(&mut a_vec, &covar_scaled, &axis);
    } else if channels == 4 {
        ssymv4(&mut a_vec, &covar_scaled, &axis);
    }

    // |C * axis| approximates the largest eigenvalue (axis is ~unit length).
    let mut sq_sum = 0.0;
    for &value in a_vec[..channels].iter() {
        sq_sum += sq(value);
    }
    let lambda = sq_sum.sqrt();

    // trace(C) - lambda = variance not explained by the principal axis.
    let mut bound = covar_scaled[0] + covar_scaled[4] + covar_scaled[7];
    if channels == 4 {
        bound += covar_scaled[9];
    }
    bound -= lambda;

    f32::max(bound, 0.0)
}

/// Symmetric 3x3 matrix-vector product: `a = covar * b`.
/// `covar` holds the packed upper triangle of a symmetric 4x4 matrix;
/// indices 0,1,2 / 4,5 / 7 form the 3x3 part.
pub(crate) fn ssymv3(a: &mut [f32; 4], covar: &[f32; 10], b: &[f32; 4]) {
    a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2];
    a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2];
    a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2];
}

/// Symmetric 4x4 matrix-vector product: `a = covar * b`, with `covar` as
/// the packed upper triangle (10 coefficients).
pub(crate) fn ssymv4(a: &mut [f32; 4], covar: &[f32; 10], b: &[f32; 4]) {
    a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3];
    a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3];
    a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3];
    a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3];
}

/// Approximates the dominant eigenvector of the symmetric covariance matrix
/// `covar` via power iteration, writing it (unnormalized scale aside) into
/// `axis[..channels]`.
pub(crate) fn compute_axis(
    axis: &mut [f32; 4],
    covar: &[f32; 10],
    power_iterations: u32,
    channels: usize,
) {
    let mut a_vec = [1.0, 1.0, 1.0, 1.0];

    for i in 0..power_iterations {
        if channels == 3 {
            ssymv3(axis, covar, &a_vec);
        } else if channels == 4 {
            ssymv4(axis, covar, &a_vec);
        }

        a_vec[..channels].copy_from_slice(&axis[..channels]);

        // Renormalize every other iteration
        if i % 2 == 1 {
            let mut norm_sq = 0.0;
            for p in 0..channels {
                norm_sq += sq(axis[p]);
            }

            let rnorm = 1.0 / norm_sq.sqrt();
            for value in a_vec[..channels].iter_mut() {
                *value *= rnorm;
            }
        }
    }

    axis[..channels].copy_from_slice(&a_vec[..channels]);
}

/// Accumulates raw first/second-order statistics of the pixels of `block`
/// selected by `mask` into `stats`:
/// `stats[0..10]` = packed upper triangle of the sum-of-products matrix,
/// `stats[10..14]` = per-channel sums, `stats[14]` = pixel count.
/// `block` is planar (16 values per channel).
pub(crate) fn compute_stats_masked(
    stats: &mut [f32; 15],
    block: &[f32; 64],
    mask: u32,
    channels: usize,
) {
    let mut mask_shifted = mask << 1;
    for k in 0..16 {
        mask_shifted >>= 1;
        // Branch-free: excluded pixels contribute zero via the flag multiply.
        let flag = (mask_shifted & 1) as f32;

        let mut rgba = [0.0; 4];
        for p in 0..channels {
            rgba[p] = block[k + p * 16] * flag;
        }
        stats[14] += flag;

        stats[10] += rgba[0];
        stats[11] += rgba[1];
        stats[12] += rgba[2];

        stats[0] += rgba[0] * rgba[0];
        stats[1] += rgba[0] * rgba[1];
        stats[2] += rgba[0] * rgba[2];

        stats[4] += rgba[1] * rgba[1];
        stats[5] += rgba[1] * rgba[2];

        stats[7] += rgba[2] * rgba[2];

        if channels == 4 {
            stats[13] += rgba[3];
            stats[3] += rgba[0] * rgba[3];
            stats[6] += rgba[1] * rgba[3];
            stats[8] += rgba[2] * rgba[3];
            stats[9] += rgba[3] * rgba[3];
        }
    }
}

/// Converts raw statistics (see [`compute_stats_masked`]) into a covariance
/// matrix: `cov(x, y) = E[xy] * n - E[x]E[y] * n` (unnormalized).
pub(crate) fn covar_from_stats(covar: &mut [f32; 10], stats: [f32; 15], channels: usize) {
    covar[0] = stats[0] - stats[10] * stats[10] / stats[14];
    covar[1] = stats[1] - stats[10] * stats[11] / stats[14];
    covar[2] = stats[2] - stats[10] * stats[12] / stats[14];

    covar[4] = stats[4] - stats[11] * stats[11] / stats[14];
    covar[5] = stats[5] - stats[11] * stats[12] / stats[14];

    covar[7] = stats[7] - stats[12] * stats[12] / stats[14];

    if channels == 4 {
        covar[3] = stats[3] - stats[10] * stats[13] / stats[14];
        covar[6] = stats[6] - stats[11] * stats[13] / stats[14];
        covar[8] = stats[8] - stats[12] * stats[13] / stats[14];
        covar[9] = stats[9] - stats[13] * stats[13] / stats[14];
    }
}

/// Computes the covariance matrix and per-channel mean (`dc`) of the pixels
/// of `block` selected by `mask`.
pub(crate) fn compute_covar_dc_masked(
    covar: &mut [f32; 10],
    dc: &mut [f32; 4],
    block: &[f32; 64],
    mask: u32,
    channels: usize,
) {
    let mut stats = [0.0; 15];
    compute_stats_masked(&mut stats, block, mask, channels);

    // Calculate dc values from stats
    for p in 0..channels {
        dc[p] = stats[10 + p] / stats[14];
    }

    covar_from_stats(covar, stats, channels);
}

/// Computes the principal axis and mean (`dc`) of the masked pixels of
/// `block`, used to place endpoints along the direction of maximum variance.
pub(crate) fn block_pca_axis(
    axis: &mut [f32; 4],
    dc: &mut [f32; 4],
    block: &[f32; 64],
    mask: u32,
    channels: usize,
) {
    const POWER_ITERATIONS: u32 = 8; // 4 not enough for HQ

    let mut covar = [0.0; 10];
    compute_covar_dc_masked(&mut covar, dc, block, mask, channels);

    // Scale into ~unit range and regularize the diagonal before iterating.
    const INV_VAR: f32 = 1.0 / (256.0 * 256.0);
    for covar in covar.iter_mut() {
        *covar *= INV_VAR;
    }

    const EPS: f32 = sq(0.001);
    covar[0] += EPS;
    covar[4] += EPS;
    covar[7] += EPS;
    covar[9] += EPS;

    compute_axis(axis, &covar, POWER_ITERATIONS, channels);
}

/// Scores a 2-way split of the block: the sum of PCA bounds of the pixels
/// in `mask` and of the complementary pixels (derived by subtracting from
/// `full_stats`). Lower is a better partition candidate.
pub(crate) fn block_pca_bound_split(
    block: &[f32; 64],
    mask: u32,
    full_stats: [f32; 15],
    channels: usize,
) -> f32 {
    let mut stats = [0.0; 15];
    compute_stats_masked(&mut stats, block, mask, channels);

    let mut covar1 = [0.0; 10];
    covar_from_stats(&mut covar1, stats, channels);

    // Complement subset statistics: full block minus the masked subset.
    for i in 0..15 {
        stats[i] = full_stats[i] - stats[i];
    }

    let mut covar2 = [0.0; 10];
    covar_from_stats(&mut covar2, stats, channels);

    let mut bound = 0.0;
    bound += get_pca_bound(&covar1, channels);
    bound += get_pca_bound(&covar2, channels);

    bound.sqrt() * 256.0
}

/// Quantizes every pixel of `block` to a `bits`-bit index against the
/// per-subset endpoints in `ep` (8 floats per subset, selected by the 2-bit
/// fields of `pattern`), writing 4-bit indices into `qblock` and returning
/// the total squared error.
pub(crate) fn block_quant(
    qblock: &mut [u32; 2],
    block: &[f32; 64],
    bits: u32,
    ep: &[f32],
    pattern: u32,
    channels: usize,
) -> f32 {
    let mut total_err = 0.0;
    let levels = 1 << bits;

    qblock[0] = 0;
    qblock[1] = 0;

    let mut pattern_shifted = pattern;
    for k in 0..16 {
        let j = (pattern_shifted & 3) as usize; // subset index of this pixel
        pattern_shifted >>= 2;

        // Project the pixel onto the endpoint segment (0..1 parameter).
        let mut proj = 0.0;
        let mut div = 0.0;
        for p in 0..channels {
            let ep_a = ep[8 * j + p];
            let ep_b = ep[8 * j + 4 + p];
            proj += (block[k + p * 16] - ep_a) * (ep_b - ep_a);
            div += sq(ep_b - ep_a);
        }

        proj /= div;

        let q1 = (proj * levels as f32 + 0.5) as i32;
        let q1_clamped = i32::clamp(q1, 1, levels - 1);

        // Try the two candidate indices around the projection and keep the
        // one with lower decoded error.
        let mut err0 = 0.0;
        let mut err1 = 0.0;
        let w0 = get_unquant_value(bits, q1_clamped - 1);
        let w1 = get_unquant_value(bits, q1_clamped);

        for p in 0..channels {
            let ep_a = ep[8 * j + p];
            let ep_b = ep[8 * j + 4 + p];
            // Same integer interpolation the decoder uses.
            let dec_v0 = (((64 - w0) * ep_a as i32 + w0 * ep_b as i32 + 32) / 64) as f32;
            let dec_v1 = (((64 - w1) * ep_a as i32 + w1 * ep_b as i32 + 32) / 64) as f32;
            err0 += sq(dec_v0 - block[k + p * 16]);
            err1 += sq(dec_v1 - block[k + p * 16]);
        }

        let mut best_err = err1;
        let mut best_q = q1_clamped;
        if err0 < err1 {
            best_err = err0;
            best_q = q1_clamped - 1;
        }

        qblock[k / 8] |= (best_q as u32) << (4 * (k % 8));
        total_err += best_err;
    }

    total_err
}

/// Computes an initial endpoint pair for the masked pixels of `block` by
/// projecting them onto the principal axis and taking the extremes.
/// Writes endpoint A into `ep[0..4]` and endpoint B into `ep[4..8]`.
pub(crate) fn block_segment_core(ep: &mut [f32], block: &[f32; 64], mask: u32, channels: usize) {
    let mut axis = [0.0; 4];
    let mut dc = [0.0; 4];
    block_pca_axis(&mut axis, &mut dc, block, mask, channels);

    let mut ext = [f32::INFINITY, f32::NEG_INFINITY];

    // Find min/max
    let mut mask_shifted = mask << 1;
    for k in 0..16 {
        mask_shifted >>= 1;
        if (mask_shifted & 1) == 0 {
            continue;
        }

        let mut dot = 0.0;
        for p in 0..channels {
            dot += axis[p] * (block[16 * p + k] - dc[p]);
        }

        ext[0] = f32::min(ext[0], dot);
        ext[1] = f32::max(ext[1], dot);
    }

    // Create some distance if the endpoints collapse
    if ext[1] - ext[0] < 1.0 {
        ext[0] -= 0.5;
        ext[1] += 0.5;
    }

    for i in 0..2 {
        for p in 0..channels {
            ep[4 * i + p] = ext[i] * axis[p] + dc[p];
        }
    }
}

/// Writes the 16 per-pixel indices of `qblock` into the output bit stream.
/// Indices flagged in `flips` are mirrored (`levels-1 - q`); the very first
/// index is written with one less bit (its MSB is implicitly zero per the
/// BC7 anchor-index rule).
pub(crate) fn bc7_code_qblock(
    data: &mut [u32; 5],
    qpos: &mut u32,
    qblock: [u32; 2],
    bits: u32,
    flips: u32,
) {
    let levels = 1 << bits;
    let mut flips_shifted = flips;

    for k1 in 0..2 {
        let mut qbits_shifted = qblock[k1];
        for k2 in 0..8 {
            let mut q = qbits_shifted & 15;
            if (flips_shifted & 1) > 0 {
                q = (levels - 1) - q;
            }

            if k1 == 0 && k2 == 0 {
                // Anchor pixel 0: implicit zero MSB, one bit fewer.
                put_bits(data, qpos, bits - 1, q);
            } else {
                put_bits(data, qpos, bits, q);
            }
            qbits_shifted >>= 4;
            flips_shifted >>= 1;
        }
    }
}

/// Removes the implicit MSB of the non-zero anchor indices for BC7 modes
/// 0/1/2/3/7 by shifting the index bit stream down at each anchor position
/// (processed high-to-low so earlier shifts don't move later anchors).
pub(crate) fn bc7_code_adjust_skip_mode01237(data: &mut [u32; 5], mode: usize, part_id: i32) {
    let pairs = if mode == 0 || mode == 2 { 3 } else { 2 };
    let bits = if mode == 0 || mode == 1 { 3 } else { 2 };

    let mut skips = get_skips(part_id);

    if pairs > 2 && skips[1] < skips[2] {
        skips.swap(1, 2);
    }

    for &k in skips[1..pairs].iter() {
        data_shl_1bit_from(data, 128 + (pairs - 1) - (15 - k as usize) * bits);
    }
}

/// For single-subset modes 4/5/6: if the anchor index's MSB is set, swaps
/// the endpoint pair and mirrors all indices so the anchor MSB becomes zero
/// (required because that bit is not stored).
pub(crate) fn bc7_code_apply_swap_mode456(
    qep: &mut [i32],
    channels: usize,
    qblock: &mut [u32; 2],
    bits: u32,
) {
    let levels = 1 << bits;

    if (qblock[0] & 15) >= levels / 2 {
        for p in 0..channels {
            qep.swap(p, channels + p);
        }

        // 0x11111111 replicates (levels-1) into every 4-bit index lane.
        for value in qblock.iter_mut() {
            *value = (0x11111111 * (levels - 1)) - *value;
        }
    }
}

/// For multi-subset modes 0/1/2/3/7: swaps the endpoints of every subset
/// whose anchor index has its MSB set, and returns a per-pixel flip mask for
/// [`bc7_code_qblock`] to mirror those subsets' indices.
pub(crate) fn bc7_code_apply_swap_mode01237(
    qep: &mut [i32; 24],
    qblock: [u32; 2],
    mode: usize,
    part_id: i32,
) -> u32 {
    let bits = if mode == 0 || mode == 1 { 3 } else { 2 };
    let pairs = if mode == 0 || mode == 2 { 3 } else { 2 };

    let mut flips = 0;
    let levels = 1 << bits;

    let skips = get_skips(part_id);

    for j in 0..pairs {
        let k0 = skips[j] as usize;
        // Extract 4 bits from qblock at position k0
        let q = (qblock[k0 >> 3] << (28 - (k0 & 7) * 4)) >> 28;

        if q >= levels / 2 {
            for p in 0..4 {
                qep.swap(8 * j + p, 8 * j + 4 + p);
            }

            let pmask = get_pattern_mask(part_id, j as u32);
            flips |= pmask;
        }
    }

    flips
}
--------------------------------------------------------------------------------
/src/block_compressor.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, num::NonZeroU64}; 2 | 3 | use bytemuck::{cast_slice, Pod, Zeroable}; 4 | use wgpu::{ 5 | self, include_wgsl, BindGroup, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, 6 | BindGroupLayoutDescriptor, BindGroupLayoutEntry, BindingResource, BindingType, Buffer, 7 | BufferBinding, BufferBindingType, BufferDescriptor, BufferUsages, ComputePass, ComputePipeline, 8 | ComputePipelineDescriptor, Device, PipelineCompilationOptions, PipelineLayoutDescriptor, Queue, 9 | ShaderModule, ShaderRuntimeChecks, ShaderStages, TextureSampleType, TextureView, 10 | TextureViewDimension, 11 | }; 12 | 13 | #[cfg(feature = "bc6h")] 14 | use crate::BC6HSettings; 15 | #[cfg(feature = "bc7")] 16 | use crate::BC7Settings; 17 | use crate::CompressionVariant; 18 | 19 | #[derive(Copy, Clone, Zeroable, Pod)] 20 | #[repr(C)] 21 | struct Uniforms { 22 | /// The width of the image data. 23 | width: u32, 24 | /// The height of the image data. 25 | height: u32, 26 | /// Start row of the texture data we want to convert. 27 | texture_y_offset: u32, 28 | /// Start of the blocks data in u32 elements. 29 | blocks_offset: u32, 30 | } 31 | 32 | struct Task { 33 | variant: CompressionVariant, 34 | width: u32, 35 | height: u32, 36 | uniform_offset: u32, 37 | #[cfg(any(feature = "bc6h", feature = "bc7"))] 38 | setting_offset: u32, 39 | texture_y_offset: u32, 40 | buffer_offset: u32, 41 | texture_view: TextureView, 42 | buffer: Buffer, 43 | } 44 | 45 | /// Compresses texture data with a block compression algorithm using WGPU compute shader. 
46 | pub struct GpuBlockCompressor { 47 | scratch_buffer: Vec, 48 | task: Vec, 49 | uniforms_buffer: Buffer, 50 | #[cfg(feature = "bc6h")] 51 | bc6h_settings_buffer: Buffer, 52 | #[cfg(feature = "bc7")] 53 | bc7_settings_buffer: Buffer, 54 | bind_group_layouts: HashMap, 55 | pipelines: HashMap, 56 | device: Device, 57 | queue: Queue, 58 | uniforms_aligned_size: usize, 59 | #[cfg(feature = "bc6h")] 60 | bc6h_aligned_size: usize, 61 | #[cfg(feature = "bc7")] 62 | bc7_aligned_size: usize, 63 | } 64 | 65 | impl GpuBlockCompressor { 66 | /// Creates a new block compressor instance. 67 | /// 68 | /// [`wgpu::Device`] and [`wgpu::Queue`] are internally reference counted and can cheaply and 69 | /// safely be cloned. 70 | pub fn new(device: Device, queue: Queue) -> Self { 71 | let limits = device.limits(); 72 | 73 | let alignment = limits.min_uniform_buffer_offset_alignment as usize; 74 | let size = size_of::(); 75 | let uniforms_aligned_size = size.div_ceil(alignment) * alignment; 76 | 77 | #[cfg(feature = "bc6h")] 78 | let bc6h_aligned_size = { 79 | let alignment = limits.min_storage_buffer_offset_alignment as usize; 80 | let size = size_of::(); 81 | size.div_ceil(alignment) * alignment 82 | }; 83 | 84 | #[cfg(feature = "bc7")] 85 | let bc7_aligned_size = { 86 | let alignment = limits.min_storage_buffer_offset_alignment as usize; 87 | let size = size_of::(); 88 | size.div_ceil(alignment) * alignment 89 | }; 90 | 91 | #[cfg(feature = "bc15")] 92 | let shader_module_bc1_to_5 = 93 | device.create_shader_module(include_wgsl!("shader/bc1_to_5.wgsl")); 94 | #[cfg(feature = "bc6h")] 95 | let shader_module_bc6h = device.create_shader_module(include_wgsl!("shader/bc6h.wgsl")); 96 | // The addition of the bounded loop in https://github.com/gfx-rs/wgpu/pull/7080 97 | // seems to cause the program to crash with AMD integrated GPU. 
98 | #[cfg(feature = "bc7")] 99 | let shader_module_bc7 = { 100 | unsafe { 101 | let checks = ShaderRuntimeChecks { 102 | bounds_checks: true, 103 | force_loop_bounding: false, 104 | }; 105 | device.create_shader_module_trusted(include_wgsl!("shader/bc7.wgsl"), checks) 106 | } 107 | }; 108 | 109 | let uniforms_buffer = device.create_buffer(&BufferDescriptor { 110 | label: Some("uniforms"), 111 | size: (uniforms_aligned_size * 16) as _, 112 | usage: BufferUsages::COPY_DST | BufferUsages::UNIFORM, 113 | mapped_at_creation: false, 114 | }); 115 | 116 | #[cfg(feature = "bc6h")] 117 | let bc6h_settings_buffer = device.create_buffer(&BufferDescriptor { 118 | label: Some("bc6h settings"), 119 | size: (bc6h_aligned_size * 16) as _, 120 | usage: BufferUsages::COPY_DST | BufferUsages::STORAGE, 121 | mapped_at_creation: false, 122 | }); 123 | 124 | #[cfg(feature = "bc7")] 125 | let bc7_settings_buffer = device.create_buffer(&BufferDescriptor { 126 | label: Some("bc7 settings"), 127 | size: (bc7_aligned_size * 16) as _, 128 | usage: BufferUsages::COPY_DST | BufferUsages::STORAGE, 129 | mapped_at_creation: false, 130 | }); 131 | 132 | let mut bind_group_layouts = HashMap::new(); 133 | let mut pipelines = HashMap::new(); 134 | 135 | #[cfg(feature = "bc15")] 136 | Self::create_pipeline( 137 | &device, 138 | &shader_module_bc1_to_5, 139 | &mut bind_group_layouts, 140 | &mut pipelines, 141 | CompressionVariant::BC1, 142 | ); 143 | #[cfg(feature = "bc15")] 144 | Self::create_pipeline( 145 | &device, 146 | &shader_module_bc1_to_5, 147 | &mut bind_group_layouts, 148 | &mut pipelines, 149 | CompressionVariant::BC2, 150 | ); 151 | #[cfg(feature = "bc15")] 152 | Self::create_pipeline( 153 | &device, 154 | &shader_module_bc1_to_5, 155 | &mut bind_group_layouts, 156 | &mut pipelines, 157 | CompressionVariant::BC3, 158 | ); 159 | #[cfg(feature = "bc15")] 160 | Self::create_pipeline( 161 | &device, 162 | &shader_module_bc1_to_5, 163 | &mut bind_group_layouts, 164 | &mut pipelines, 165 | 
CompressionVariant::BC4, 166 | ); 167 | #[cfg(feature = "bc15")] 168 | Self::create_pipeline( 169 | &device, 170 | &shader_module_bc1_to_5, 171 | &mut bind_group_layouts, 172 | &mut pipelines, 173 | CompressionVariant::BC5, 174 | ); 175 | #[cfg(feature = "bc6h")] 176 | Self::create_pipeline( 177 | &device, 178 | &shader_module_bc6h, 179 | &mut bind_group_layouts, 180 | &mut pipelines, 181 | CompressionVariant::BC6H(BC6HSettings::basic()), 182 | ); 183 | #[cfg(feature = "bc7")] 184 | Self::create_pipeline( 185 | &device, 186 | &shader_module_bc7, 187 | &mut bind_group_layouts, 188 | &mut pipelines, 189 | CompressionVariant::BC7(BC7Settings::alpha_basic()), 190 | ); 191 | 192 | Self { 193 | scratch_buffer: Vec::default(), 194 | task: Vec::default(), 195 | uniforms_buffer, 196 | #[cfg(feature = "bc6h")] 197 | bc6h_settings_buffer, 198 | #[cfg(feature = "bc7")] 199 | bc7_settings_buffer, 200 | bind_group_layouts, 201 | pipelines, 202 | device, 203 | queue, 204 | uniforms_aligned_size, 205 | #[cfg(feature = "bc6h")] 206 | bc6h_aligned_size, 207 | #[cfg(feature = "bc7")] 208 | bc7_aligned_size, 209 | } 210 | } 211 | 212 | #[allow(unused_mut)] 213 | fn create_pipeline( 214 | device: &Device, 215 | shader_module: &ShaderModule, 216 | bind_group_layouts: &mut HashMap, 217 | pipelines: &mut HashMap, 218 | variant: CompressionVariant, 219 | ) { 220 | let mut layout_entries = vec![ 221 | BindGroupLayoutEntry { 222 | binding: 0, 223 | visibility: ShaderStages::COMPUTE, 224 | ty: BindingType::Texture { 225 | sample_type: TextureSampleType::Float { filterable: true }, 226 | view_dimension: TextureViewDimension::D2, 227 | multisampled: false, 228 | }, 229 | count: None, 230 | }, 231 | BindGroupLayoutEntry { 232 | binding: 1, 233 | visibility: ShaderStages::COMPUTE, 234 | ty: BindingType::Buffer { 235 | ty: BufferBindingType::Storage { read_only: false }, 236 | has_dynamic_offset: false, 237 | min_binding_size: None, 238 | }, 239 | count: None, 240 | }, 241 | BindGroupLayoutEntry { 
242 | binding: 2, 243 | visibility: ShaderStages::COMPUTE, 244 | ty: BindingType::Buffer { 245 | ty: BufferBindingType::Uniform, 246 | has_dynamic_offset: true, 247 | min_binding_size: None, 248 | }, 249 | count: None, 250 | }, 251 | ]; 252 | 253 | match variant { 254 | #[cfg(feature = "bc6h")] 255 | CompressionVariant::BC6H(..) => { 256 | layout_entries.push(BindGroupLayoutEntry { 257 | binding: 3, 258 | visibility: ShaderStages::COMPUTE, 259 | ty: BindingType::Buffer { 260 | ty: BufferBindingType::Storage { read_only: true }, 261 | has_dynamic_offset: true, 262 | min_binding_size: NonZeroU64::new(size_of::() as _), 263 | }, 264 | count: None, 265 | }); 266 | } 267 | #[cfg(feature = "bc7")] 268 | CompressionVariant::BC7(..) => { 269 | layout_entries.push(BindGroupLayoutEntry { 270 | binding: 3, 271 | visibility: ShaderStages::COMPUTE, 272 | ty: BindingType::Buffer { 273 | ty: BufferBindingType::Storage { read_only: true }, 274 | has_dynamic_offset: true, 275 | min_binding_size: NonZeroU64::new(size_of::() as _), 276 | }, 277 | count: None, 278 | }); 279 | } 280 | #[allow(unreachable_patterns)] 281 | _ => {} 282 | } 283 | 284 | let name = variant.name(); 285 | 286 | let bind_group_layout = device.create_bind_group_layout(&BindGroupLayoutDescriptor { 287 | label: Some(&format!("{name} bind group layout")), 288 | entries: &layout_entries, 289 | }); 290 | 291 | let pipeline_layout = device.create_pipeline_layout(&PipelineLayoutDescriptor { 292 | label: Some(&format!("{name} block compression pipeline layout")), 293 | bind_group_layouts: &[&bind_group_layout], 294 | push_constant_ranges: &[], 295 | }); 296 | 297 | let pipeline = device.create_compute_pipeline(&ComputePipelineDescriptor { 298 | label: Some(&format!("{name} block compression pipeline")), 299 | layout: Some(&pipeline_layout), 300 | module: shader_module, 301 | entry_point: Some(variant.entry_point()), 302 | compilation_options: PipelineCompilationOptions::default(), 303 | cache: None, 304 | }); 305 | 306 
| bind_group_layouts.insert(variant, bind_group_layout); 307 | pipelines.insert(variant, pipeline); 308 | } 309 | 310 | /// Adds a texture compression task to the queue. 311 | /// 312 | /// This API is designed to be very flexible. For example, it is possible to fill the mip map 313 | /// levels of a texture with multiple calls to this function. 314 | /// 315 | /// # Texture View Requirements 316 | /// The source texture should provide enough channels for the texture compression. If only a 317 | /// single red channel is provided and BC1 is used, only the red channel will be properly 318 | /// encoded. All texture compression need to work on the raw texture data. The texture can 319 | /// use a sRGB texture format, but it needs to provide a view with a non-sRGB texture format. 320 | /// For example for a texture with a `Rgba8UnormSrgb` texture format, you will need to provide 321 | /// a texture view with the `Rgba8Unorm` format. 322 | /// 323 | /// BC1, 2, 3, 4, 5 and 7 expect to work on an `unorm` format. `Rgba8Unorm` should be correct 324 | /// for 99.9% of cases. 325 | /// 326 | /// BC6H needs an `unorm` or `float` format. `Rgba16Float` is optimal for HDR textures. 327 | /// Colors should be in linear space and not in sRGBA space. 328 | /// 329 | /// # Buffer Requirements 330 | /// The destination buffer must have sufficient capacity to store the compressed blocks at the 331 | /// specified offset. The required size can be calculated using 332 | /// [`CompressionVariant::blocks_byte_size()`]. 
333 | /// 334 | /// For example: 335 | /// 336 | /// ```ignore 337 | /// let required_size = variant.blocks_byte_size(width, height); 338 | /// let total_size = offset + required_size; 339 | /// assert!(buffer.size() >= total_size); 340 | /// ``` 341 | /// 342 | /// # Arguments 343 | /// * `variant` - The block compression format to use 344 | /// * `texture_view` - View into the source texture to compress 345 | /// * `width` - Width of the texture view in pixels 346 | /// * `height` - Height of the texture view in pixels 347 | /// * `buffer` - Destination storage buffer for the compressed data 348 | /// * `texture_y_offset` - Optional offset in pixel rows into the source texture 349 | /// * `blocks_offset` - Optional offset in bytes into the destination buffer 350 | /// 351 | /// # Panics 352 | /// - If `width` or `height` or `texture_y_offset`, if set, is not a multiple of 4 353 | /// - If the destination `buffer` is not a storage buffer 354 | /// - If the destination `buffer` is too small to hold the compressed blocks at the specified offset 355 | #[allow(clippy::too_many_arguments)] 356 | pub fn add_compression_task( 357 | &mut self, 358 | variant: CompressionVariant, 359 | texture_view: &TextureView, 360 | width: u32, 361 | height: u32, 362 | buffer: &Buffer, 363 | texture_y_offset: Option, 364 | blocks_offset: Option, 365 | ) { 366 | assert_eq!(height % 4, 0); 367 | assert_eq!(width % 4, 0); 368 | 369 | if let Some(texture_y_offset) = texture_y_offset { 370 | assert_eq!(texture_y_offset % 4, 0); 371 | } 372 | 373 | assert!( 374 | buffer.usage().contains(BufferUsages::STORAGE), 375 | "buffer needs to be a storage buffer" 376 | ); 377 | 378 | let required_size = variant.blocks_byte_size(width, height); 379 | let total_size = blocks_offset.unwrap_or(0) as usize + required_size; 380 | 381 | assert!( 382 | buffer.size() as usize >= total_size, 383 | "buffer size ({}) is too small to hold compressed blocks at offset {}. 
Required size: {}", 384 | buffer.size(), 385 | blocks_offset.unwrap_or(0), 386 | total_size 387 | ); 388 | 389 | self.task.push(Task { 390 | variant, 391 | width, 392 | height, 393 | uniform_offset: 0, 394 | #[cfg(any(feature = "bc6h", feature = "bc7"))] 395 | setting_offset: 0, 396 | texture_y_offset: texture_y_offset.unwrap_or(0), 397 | buffer_offset: blocks_offset.unwrap_or(0), 398 | texture_view: texture_view.clone(), 399 | buffer: buffer.clone(), 400 | }); 401 | } 402 | 403 | fn update_buffer_sizes(&mut self) { 404 | let total_uniforms_size = self.uniforms_aligned_size * self.task.len(); 405 | if total_uniforms_size > self.uniforms_buffer.size() as usize { 406 | self.uniforms_buffer = self.device.create_buffer(&BufferDescriptor { 407 | label: Some("uniforms buffer"), 408 | size: total_uniforms_size as u64, 409 | usage: BufferUsages::COPY_DST | BufferUsages::UNIFORM, 410 | mapped_at_creation: false, 411 | }); 412 | } 413 | 414 | #[cfg(feature = "bc6h")] 415 | { 416 | let bc6_setting_count = self 417 | .task 418 | .iter() 419 | .filter(|task| matches!(task.variant, CompressionVariant::BC6H(..))) 420 | .count(); 421 | 422 | let total_bc6h_size = self.bc6h_aligned_size * bc6_setting_count; 423 | if total_bc6h_size > self.bc6h_settings_buffer.size() as usize { 424 | self.bc6h_settings_buffer = self.device.create_buffer(&BufferDescriptor { 425 | label: Some("bc6h settings buffer"), 426 | size: total_bc6h_size as u64, 427 | usage: BufferUsages::COPY_DST | BufferUsages::STORAGE, 428 | mapped_at_creation: false, 429 | }); 430 | } 431 | } 432 | 433 | #[cfg(feature = "bc7")] 434 | { 435 | let bc7_setting_count = self 436 | .task 437 | .iter() 438 | .filter(|task| matches!(task.variant, CompressionVariant::BC7(..))) 439 | .count(); 440 | 441 | let total_bc7_size = self.bc7_aligned_size * bc7_setting_count; 442 | if total_bc7_size > self.bc7_settings_buffer.size() as usize { 443 | self.bc7_settings_buffer = self.device.create_buffer(&BufferDescriptor { 444 | label: 
Some("bc7 settings buffer"), 445 | size: total_bc7_size as u64, 446 | usage: BufferUsages::COPY_DST | BufferUsages::STORAGE, 447 | mapped_at_creation: false, 448 | }); 449 | } 450 | } 451 | } 452 | 453 | fn upload(&mut self) { 454 | self.scratch_buffer.clear(); 455 | for (index, task) in self.task.iter_mut().enumerate() { 456 | let offset = index * self.uniforms_aligned_size; 457 | task.uniform_offset = offset as u32; 458 | 459 | let uniforms = Uniforms { 460 | width: task.width, 461 | height: task.height, 462 | texture_y_offset: task.texture_y_offset, 463 | blocks_offset: task.buffer_offset / 4, 464 | }; 465 | 466 | self.scratch_buffer 467 | .resize(offset + self.uniforms_aligned_size, 0); 468 | self.scratch_buffer[offset..offset + size_of::()] 469 | .copy_from_slice(cast_slice(&[uniforms])); 470 | } 471 | if !self.scratch_buffer.is_empty() { 472 | if let Some(mut data) = self.queue.write_buffer_with( 473 | &self.uniforms_buffer, 474 | 0, 475 | NonZeroU64::new(self.scratch_buffer.len() as u64).unwrap(), 476 | ) { 477 | data.copy_from_slice(&self.scratch_buffer); 478 | } 479 | } 480 | 481 | #[cfg(feature = "bc6h")] 482 | { 483 | self.scratch_buffer.clear(); 484 | for (index, (settings, task)) in self 485 | .task 486 | .iter_mut() 487 | .filter_map(|task| { 488 | #[allow(irrefutable_let_patterns)] 489 | if let CompressionVariant::BC6H(settings) = task.variant { 490 | Some((settings, task)) 491 | } else { 492 | None 493 | } 494 | }) 495 | .enumerate() 496 | { 497 | let offset = index * self.bc6h_aligned_size; 498 | task.setting_offset = offset as u32; 499 | self.scratch_buffer 500 | .resize(offset + self.bc6h_aligned_size, 0); 501 | self.scratch_buffer[offset..offset + size_of::()] 502 | .copy_from_slice(cast_slice(&[settings])); 503 | } 504 | if !self.scratch_buffer.is_empty() { 505 | if let Some(mut data) = self.queue.write_buffer_with( 506 | &self.bc6h_settings_buffer, 507 | 0, 508 | NonZeroU64::new(self.scratch_buffer.len() as u64).unwrap(), 509 | ) { 510 | 
data.copy_from_slice(&self.scratch_buffer); 511 | } 512 | } 513 | } 514 | 515 | #[cfg(feature = "bc7")] 516 | { 517 | self.scratch_buffer.clear(); 518 | for (index, (settings, task)) in self 519 | .task 520 | .iter_mut() 521 | .filter_map(|task| { 522 | #[allow(irrefutable_let_patterns)] 523 | if let CompressionVariant::BC7(settings) = task.variant { 524 | Some((settings, task)) 525 | } else { 526 | None 527 | } 528 | }) 529 | .enumerate() 530 | { 531 | let offset = index * self.bc7_aligned_size; 532 | task.setting_offset = offset as u32; 533 | self.scratch_buffer 534 | .resize(offset + self.bc7_aligned_size, 0); 535 | self.scratch_buffer[offset..offset + size_of::()] 536 | .copy_from_slice(cast_slice(&[settings])); 537 | } 538 | if !self.scratch_buffer.is_empty() { 539 | if let Some(mut data) = self.queue.write_buffer_with( 540 | &self.bc7_settings_buffer, 541 | 0, 542 | NonZeroU64::new(self.scratch_buffer.len() as u64).unwrap(), 543 | ) { 544 | data.copy_from_slice(&self.scratch_buffer); 545 | } 546 | } 547 | } 548 | } 549 | 550 | /// Will upload all dispatch data and then dispatches all compression tasks to the GPU. 551 | /// 552 | /// # Arguments 553 | /// * `pass` - The compute pass to record commands into 554 | pub fn compress(&mut self, pass: &mut ComputePass) { 555 | self.update_buffer_sizes(); 556 | self.upload(); 557 | 558 | let mut bind_groups: Vec = self 559 | .task 560 | .iter() 561 | .map(|task| self.create_bind_group(task)) 562 | .collect(); 563 | 564 | for (task, bind_group) in self.task.drain(..).zip(bind_groups.drain(..)) { 565 | let pipeline = self 566 | .pipelines 567 | .get(&task.variant) 568 | .expect("can't find pipeline for variant"); 569 | 570 | pass.set_pipeline(pipeline); 571 | 572 | match task.variant { 573 | #[cfg(feature = "bc6h")] 574 | CompressionVariant::BC6H(..) 
=> { 575 | pass.set_bind_group( 576 | 0, 577 | &bind_group, 578 | &[task.uniform_offset, task.setting_offset], 579 | ); 580 | } 581 | #[cfg(feature = "bc7")] 582 | CompressionVariant::BC7(..) => { 583 | pass.set_bind_group( 584 | 0, 585 | &bind_group, 586 | &[task.uniform_offset, task.setting_offset], 587 | ); 588 | } 589 | #[allow(irrefutable_let_patterns)] 590 | #[allow(unreachable_patterns)] 591 | _ => { 592 | pass.set_bind_group(0, &bind_group, &[task.uniform_offset]); 593 | } 594 | } 595 | 596 | let block_width = task.width.div_ceil(4); 597 | let block_height = task.height.div_ceil(4); 598 | 599 | let workgroup_width = block_width.div_ceil(8); 600 | let workgroup_height = block_height.div_ceil(8); 601 | 602 | pass.dispatch_workgroups(workgroup_width, workgroup_height, 1); 603 | } 604 | } 605 | 606 | fn create_bind_group(&self, task: &Task) -> BindGroup { 607 | let bind_group_layout = self 608 | .bind_group_layouts 609 | .get(&task.variant) 610 | .expect("Can't find bind group layout for variant"); 611 | 612 | match task.variant { 613 | #[cfg(feature = "bc15")] 614 | CompressionVariant::BC1 615 | | CompressionVariant::BC2 616 | | CompressionVariant::BC3 617 | | CompressionVariant::BC4 618 | | CompressionVariant::BC5 => self.device.create_bind_group(&BindGroupDescriptor { 619 | label: Some("bind group"), 620 | layout: bind_group_layout, 621 | entries: &[ 622 | BindGroupEntry { 623 | binding: 0, 624 | resource: BindingResource::TextureView(&task.texture_view), 625 | }, 626 | BindGroupEntry { 627 | binding: 1, 628 | resource: task.buffer.as_entire_binding(), 629 | }, 630 | BindGroupEntry { 631 | binding: 2, 632 | resource: BindingResource::Buffer(BufferBinding { 633 | buffer: &self.uniforms_buffer, 634 | offset: 0, 635 | size: Some(NonZeroU64::new(self.uniforms_aligned_size as u64).unwrap()), 636 | }), 637 | }, 638 | ], 639 | }), 640 | #[cfg(feature = "bc6h")] 641 | CompressionVariant::BC6H(..) 
=> self.device.create_bind_group(&BindGroupDescriptor { 642 | label: Some("bind group"), 643 | layout: bind_group_layout, 644 | entries: &[ 645 | BindGroupEntry { 646 | binding: 0, 647 | resource: BindingResource::TextureView(&task.texture_view), 648 | }, 649 | BindGroupEntry { 650 | binding: 1, 651 | resource: task.buffer.as_entire_binding(), 652 | }, 653 | BindGroupEntry { 654 | binding: 2, 655 | resource: BindingResource::Buffer(BufferBinding { 656 | buffer: &self.uniforms_buffer, 657 | offset: 0, 658 | size: Some(NonZeroU64::new(self.uniforms_aligned_size as u64).unwrap()), 659 | }), 660 | }, 661 | BindGroupEntry { 662 | binding: 3, 663 | resource: BindingResource::Buffer(BufferBinding { 664 | buffer: &self.bc6h_settings_buffer, 665 | offset: 0, 666 | size: Some(NonZeroU64::new(self.bc6h_aligned_size as u64).unwrap()), 667 | }), 668 | }, 669 | ], 670 | }), 671 | #[cfg(feature = "bc7")] 672 | CompressionVariant::BC7(..) => self.device.create_bind_group(&BindGroupDescriptor { 673 | label: Some("bind group"), 674 | layout: bind_group_layout, 675 | entries: &[ 676 | BindGroupEntry { 677 | binding: 0, 678 | resource: BindingResource::TextureView(&task.texture_view), 679 | }, 680 | BindGroupEntry { 681 | binding: 1, 682 | resource: task.buffer.as_entire_binding(), 683 | }, 684 | BindGroupEntry { 685 | binding: 2, 686 | resource: BindingResource::Buffer(BufferBinding { 687 | buffer: &self.uniforms_buffer, 688 | offset: 0, 689 | size: Some(NonZeroU64::new(self.uniforms_aligned_size as u64).unwrap()), 690 | }), 691 | }, 692 | BindGroupEntry { 693 | binding: 3, 694 | resource: BindingResource::Buffer(BufferBinding { 695 | buffer: &self.bc7_settings_buffer, 696 | offset: 0, 697 | size: Some(NonZeroU64::new(self.bc7_aligned_size as u64).unwrap()), 698 | }), 699 | }, 700 | ], 701 | }), 702 | } 703 | } 704 | } 705 | -------------------------------------------------------------------------------- /src/encode/bc7.rs: 
-------------------------------------------------------------------------------- 1 | use super::common::*; 2 | use crate::BC7Settings; 3 | 4 | #[derive(Default)] 5 | struct Mode45Parameters { 6 | qep: [i32; 8], 7 | qblock: [u32; 2], 8 | aqep: [i32; 2], 9 | aqblock: [u32; 2], 10 | rotation: u32, 11 | swap: u32, 12 | } 13 | 14 | pub(crate) struct BlockCompressorBC7<'a> { 15 | block: [f32; 64], 16 | data: [u32; 5], 17 | best_err: f32, 18 | opaque_err: f32, 19 | settings: &'a BC7Settings, 20 | } 21 | 22 | #[inline(always)] 23 | const fn sq(x: f32) -> f32 { 24 | x * x 25 | } 26 | 27 | impl<'a> BlockCompressorBC7<'a> { 28 | pub(crate) fn new(settings: &'a BC7Settings) -> Self { 29 | Self { 30 | block: [0.0; 64], 31 | data: [0; 5], 32 | best_err: f32::INFINITY, 33 | opaque_err: 0.0, 34 | settings, 35 | } 36 | } 37 | 38 | pub(crate) fn load_block_interleaved_rgba( 39 | &mut self, 40 | rgba_data: &[u8], 41 | xx: usize, 42 | yy: usize, 43 | stride: usize, 44 | ) { 45 | for y in 0..4 { 46 | for x in 0..4 { 47 | let pixel_x = xx * 4 + x; 48 | let pixel_y = yy * 4 + y; 49 | 50 | let offset = pixel_y * stride + pixel_x * 4; 51 | 52 | let red = rgba_data[offset] as f32; 53 | let green = rgba_data[offset + 1] as f32; 54 | let blue = rgba_data[offset + 2] as f32; 55 | let alpha = rgba_data[offset + 3] as f32; 56 | 57 | self.block[y * 4 + x] = red; 58 | self.block[16 + y * 4 + x] = green; 59 | self.block[32 + y * 4 + x] = blue; 60 | self.block[48 + y * 4 + x] = alpha; 61 | } 62 | } 63 | } 64 | 65 | pub(crate) fn store_data( 66 | &self, 67 | blocks_buffer: &mut [u8], 68 | block_width: usize, 69 | xx: usize, 70 | yy: usize, 71 | ) { 72 | let offset = (yy * block_width + xx) * 16; 73 | 74 | for (index, &value) in self.data[..4].iter().enumerate() { 75 | let byte_offset = offset + index * 4; 76 | blocks_buffer[byte_offset] = value as u8; 77 | blocks_buffer[byte_offset + 1] = (value >> 8) as u8; 78 | blocks_buffer[byte_offset + 2] = (value >> 16) as u8; 79 | blocks_buffer[byte_offset + 
3] = (value >> 24) as u8; 80 | } 81 | } 82 | 83 | fn unpack_to_byte(v: i32, bits: u32) -> i32 { 84 | let vv = v << (8 - bits); 85 | vv + (vv >> bits) 86 | } 87 | 88 | fn ep_quant0367(qep: &mut [i32], ep: &[f32], mode: usize, channels: usize) { 89 | let bits = if mode == 0 { 90 | 4 91 | } else if mode == 7 { 92 | 5 93 | } else { 94 | 7 95 | }; 96 | let levels = 1 << bits; 97 | let levels2 = levels * 2 - 1; 98 | 99 | for i in 0..2 { 100 | let mut qep_b = [0; 8]; 101 | 102 | for b in 0..2 { 103 | for p in 0..4 { 104 | let v = ((ep[i * 4 + p] / 255.0 * levels2 as f32 - b as f32) / 2.0 + 0.5) 105 | as i32 106 | * 2 107 | + b as i32; 108 | qep_b[b * 4 + p] = i32::clamp(v, b as i32, levels2 - 1 + b as i32); 109 | } 110 | } 111 | 112 | let mut ep_b = [0.0; 8]; 113 | for j in 0..8 { 114 | ep_b[j] = qep_b[j] as f32; 115 | } 116 | 117 | if mode == 0 { 118 | for j in 0..8 { 119 | ep_b[j] = Self::unpack_to_byte(qep_b[j], 5) as f32; 120 | } 121 | } 122 | 123 | let mut err0 = 0.0; 124 | let mut err1 = 0.0; 125 | for p in 0..channels { 126 | err0 += sq(ep[i * 4 + p] - ep_b[p]); 127 | err1 += sq(ep[i * 4 + p] - ep_b[4 + p]); 128 | } 129 | 130 | for p in 0..4 { 131 | qep[i * 4 + p] = if err0 < err1 { qep_b[p] } else { qep_b[4 + p] }; 132 | } 133 | } 134 | } 135 | 136 | fn ep_quant1(qep: &mut [i32], ep: &mut [f32]) { 137 | let mut qep_b = [0; 16]; 138 | 139 | for b in 0..2 { 140 | for i in 0..8 { 141 | let v = ((ep[i] / 255.0 * 127.0 - b as f32) / 2.0 + 0.5) as i32 * 2 + b as i32; 142 | qep_b[b * 8 + i] = i32::clamp(v, b as i32, 126 + b as i32); 143 | } 144 | } 145 | 146 | // dequant 147 | let mut ep_b = [0.0; 16]; 148 | for k in 0..16 { 149 | ep_b[k] = Self::unpack_to_byte(qep_b[k], 7) as f32; 150 | } 151 | 152 | let mut err0 = 0.0; 153 | let mut err1 = 0.0; 154 | for j in 0..2 { 155 | for p in 0..3 { 156 | err0 += sq(ep[j * 4 + p] - ep_b[j * 4 + p]); 157 | err1 += sq(ep[j * 4 + p] - ep_b[8 + j * 4 + p]); 158 | } 159 | } 160 | 161 | for i in 0..8 { 162 | qep[i] = if err0 < err1 { 
qep_b[i] } else { qep_b[8 + i] };
        }
    }

    /// Quantizes endpoints for modes 2, 4 and 5 (no shared p-bits).
    fn ep_quant245(qep: &mut [i32], ep: &[f32], mode: usize) {
        // Mode 5 stores 7-bit color endpoints, modes 2 and 4 store 5-bit ones.
        let bits = if mode == 5 { 7 } else { 5 };

        let levels = 1 << bits;

        for i in 0..8 {
            let v = (ep[i] / 255.0 * (levels - 1) as f32 + 0.5) as i32;
            qep[i] = i32::clamp(v, 0, levels - 1);
        }
    }

    /// Quantizes endpoints with the mode-specific scheme, one endpoint pair
    /// (8 values) per subset.
    fn ep_quant(qep: &mut [i32], ep: &mut [f32], mode: usize, channels: usize) {
        // Number of endpoint pairs (subsets) for BC7 modes 0..=7.
        const PAIRS_TABLE: [usize; 8] = [3, 2, 3, 2, 1, 1, 1, 2];
        let pairs = PAIRS_TABLE[mode];

        if mode == 0 || mode == 3 || mode == 6 || mode == 7 {
            for i in 0..pairs {
                Self::ep_quant0367(&mut qep[i * 8..], &ep[i * 8..], mode, channels);
            }
        } else if mode == 1 {
            for i in 0..pairs {
                Self::ep_quant1(&mut qep[i * 8..], &mut ep[i * 8..]);
            }
        } else if mode == 2 || mode == 4 || mode == 5 {
            for i in 0..pairs {
                Self::ep_quant245(&mut qep[i * 8..], &ep[i * 8..], mode);
            }
        }
    }

    /// Expands quantized endpoints back to the 0..=255 float range using the
    /// endpoint precision of the given mode.
    fn ep_dequant(ep: &mut [f32], qep: &[i32], mode: usize) {
        const PAIRS_TABLE: [usize; 8] = [3, 2, 3, 2, 1, 1, 1, 2];
        let pairs = PAIRS_TABLE[mode];

        // mode 3, 6 are 8-bit
        if mode == 3 || mode == 6 {
            for i in 0..8 * pairs {
                ep[i] = qep[i] as f32;
            }
        } else if mode == 1 || mode == 5 {
            for i in 0..8 * pairs {
                ep[i] = Self::unpack_to_byte(qep[i], 7) as f32;
            }
        } else if mode == 0 || mode == 2 || mode == 4 {
            for i in 0..8 * pairs {
                ep[i] = Self::unpack_to_byte(qep[i], 5) as f32;
            }
        } else if mode == 7 {
            for i in 0..8 * pairs {
                ep[i] = Self::unpack_to_byte(qep[i], 6) as f32;
            }
        }
    }

    /// Round-trips the endpoints through quantization so that the encoder
    /// measures error against the values the decoder will reconstruct.
    fn ep_quant_dequant(qep: &mut [i32], ep: &mut [f32], mode: usize, channels: usize) {
        Self::ep_quant(qep, ep, mode, channels);
        Self::ep_dequant(ep, qep, mode);
    }

    /// Finds quantized endpoints and indices for a single scalar channel
    /// (alpha or a rotated color channel) and returns the squared error.
    fn opt_channel(
        &self,
        qblock: &mut [u32; 2],
qep: &mut [i32; 2],
        channel_block: &[f32; 16],
        bits: u32,
        epbits: u32,
    ) -> f32 {
        // Start inverted (min = 255, max = 0) so the scan always updates.
        let mut ep = [255.0, 0.0];

        for k in 0..16 {
            ep[0] = f32::min(ep[0], channel_block[k]);
            ep[1] = f32::max(ep[1], channel_block[k]);
        }

        Self::channel_quant_dequant(qep, &mut ep, epbits);
        let mut err = Self::channel_opt_quant(qblock, channel_block, bits, &ep);

        // Refine: alternate endpoint optimization and re-quantization.
        let refine_iterations = self.settings.refine_iterations_channel;
        for _ in 0..refine_iterations {
            Self::channel_opt_endpoints(&mut ep, channel_block, bits, *qblock);
            Self::channel_quant_dequant(qep, &mut ep, epbits);
            err = Self::channel_opt_quant(qblock, channel_block, bits, &ep);
        }

        err
    }

    /// Quantizes the two scalar endpoints to `epbits` bits and immediately
    /// dequantizes them back into the 0..=255 float range.
    fn channel_quant_dequant(qep: &mut [i32; 2], ep: &mut [f32; 2], epbits: u32) {
        let elevels = 1 << epbits;

        for i in 0..2 {
            let v = (ep[i] / 255.0 * (elevels - 1) as f32 + 0.5) as i32;
            qep[i] = i32::clamp(v, 0, elevels - 1);
            ep[i] = Self::unpack_to_byte(qep[i], epbits) as f32;
        }
    }

    /// Quantizes all 16 scalar values against the endpoint pair `ep`, packing
    /// the chosen indices into `qblock` and returning the total squared error.
    fn channel_opt_quant(
        qblock: &mut [u32; 2],
        channel_block: &[f32; 16],
        bits: u32,
        ep: &[f32; 2],
    ) -> f32 {
        let levels = 1 << bits;

        qblock[0] = 0;
        qblock[1] = 0;

        let mut total_err = 0.0;

        for k in 0..16 {
            // Project the value onto the endpoint axis; the small epsilon
            // avoids a division by zero for degenerate endpoints.
            let proj = (channel_block[k] - ep[0]) / (ep[1] - ep[0] + 0.001);

            let q1 = (proj * levels as f32 + 0.5) as i32;
            let q1_clamped = i32::clamp(q1, 1, levels - 1);

            let mut err0 = 0.0;
            let mut err1 = 0.0;
            let w0 = get_unquant_value(bits, q1_clamped - 1);
            let w1 = get_unquant_value(bits, q1_clamped);

            // Decode both candidate indices with the BC7 interpolation
            // weights and keep whichever reconstructs with less error.
            let dec_v0 = (((64 - w0) * ep[0] as i32 + w0 * ep[1] as i32 + 32) / 64) as f32;
            let dec_v1 = (((64 - w1) * ep[0] as i32 + w1 * ep[1] as i32 + 32) / 64) as f32;
            err0 += sq(dec_v0 - channel_block[k]);
err1 += sq(dec_v1 - channel_block[k]); 292 | 293 | let best_err = if err0 < err1 { err0 } else { err1 }; 294 | 295 | let best_q = if err0 < err1 { 296 | q1_clamped - 1 297 | } else { 298 | q1_clamped 299 | }; 300 | 301 | qblock[k / 8] |= (best_q as u32) << (4 * (k % 8)); 302 | total_err += best_err; 303 | } 304 | 305 | total_err 306 | } 307 | 308 | fn channel_opt_endpoints( 309 | ep: &mut [f32; 2], 310 | channel_block: &[f32; 16], 311 | bits: u32, 312 | qblock: [u32; 2], 313 | ) { 314 | let levels = 1 << bits; 315 | 316 | let mut atb1 = 0.0; 317 | let mut sum_q = 0.0; 318 | let mut sum_qq = 0.0; 319 | let mut sum = 0.0; 320 | 321 | for k1 in 0..2 { 322 | let mut qbits_shifted = qblock[k1]; 323 | for k2 in 0..8 { 324 | let k = k1 * 8 + k2; 325 | let q = (qbits_shifted & 15) as f32; 326 | qbits_shifted >>= 4; 327 | 328 | let x = (levels - 1) as f32 - q; 329 | 330 | sum_q += q; 331 | sum_qq += q * q; 332 | 333 | sum += channel_block[k]; 334 | atb1 += x * channel_block[k]; 335 | } 336 | } 337 | 338 | let atb2 = (levels - 1) as f32 * sum - atb1; 339 | 340 | let cxx = 16.0 * sq((levels - 1) as f32) - 2.0 * (levels - 1) as f32 * sum_q + sum_qq; 341 | let cyy = sum_qq; 342 | let cxy = (levels - 1) as f32 * sum_q - sum_qq; 343 | let scale = (levels - 1) as f32 / (cxx * cyy - cxy * cxy); 344 | 345 | ep[0] = (atb1 * cyy - atb2 * cxy) * scale; 346 | ep[1] = (atb2 * cxx - atb1 * cxy) * scale; 347 | 348 | ep[0] = f32::clamp(ep[0], 0.0, 255.0); 349 | ep[1] = f32::clamp(ep[1], 0.0, 255.0); 350 | 351 | if f32::abs(cxx * cyy - cxy * cxy) < 0.001 { 352 | ep[0] = sum / 16.0; 353 | ep[1] = ep[0]; 354 | } 355 | } 356 | 357 | pub(crate) fn block_segment(ep: &mut [f32], block: &[f32; 64], mask: u32, channels: usize) { 358 | block_segment_core(ep, block, mask, channels); 359 | 360 | for i in 0..2 { 361 | for p in 0..channels { 362 | ep[4 * i + p] = f32::clamp(ep[4 * i + p], 0.0, 255.0); 363 | } 364 | } 365 | } 366 | 367 | fn bc7_code_mode01237( 368 | &mut self, 369 | qep: &mut [i32; 24], 
370 | qblock: [u32; 2], 371 | part_id: i32, 372 | mode: usize, 373 | ) { 374 | let bits = if mode == 0 || mode == 1 { 3 } else { 2 }; 375 | let pairs = if mode == 0 || mode == 2 { 3 } else { 2 }; 376 | let channels = if mode == 7 { 4 } else { 3 }; 377 | 378 | let flips = bc7_code_apply_swap_mode01237(qep, qblock, mode, part_id); 379 | 380 | self.data = [0; 5]; 381 | let mut pos = 0; 382 | 383 | // Mode 0-3, 7 384 | put_bits(&mut self.data, &mut pos, (mode + 1) as u32, 1 << mode); 385 | 386 | // Partition 387 | if mode == 0 { 388 | put_bits(&mut self.data, &mut pos, 4, (part_id & 15) as u32); 389 | } else { 390 | put_bits(&mut self.data, &mut pos, 6, (part_id & 63) as u32); 391 | } 392 | 393 | // Endpoints 394 | for p in 0..channels { 395 | for j in 0..pairs * 2 { 396 | if mode == 0 { 397 | put_bits(&mut self.data, &mut pos, 4, (qep[j * 4 + p] as u32) >> 1); 398 | } else if mode == 1 { 399 | put_bits(&mut self.data, &mut pos, 6, (qep[j * 4 + p] as u32) >> 1); 400 | } else if mode == 2 { 401 | put_bits(&mut self.data, &mut pos, 5, qep[j * 4 + p] as u32); 402 | } else if mode == 3 { 403 | put_bits(&mut self.data, &mut pos, 7, (qep[j * 4 + p] as u32) >> 1); 404 | } else if mode == 7 { 405 | put_bits(&mut self.data, &mut pos, 5, (qep[j * 4 + p] as u32) >> 1); 406 | } 407 | } 408 | } 409 | 410 | // P bits 411 | if mode == 1 { 412 | for j in 0..2 { 413 | put_bits(&mut self.data, &mut pos, 1, (qep[j * 8] as u32) & 1); 414 | } 415 | } 416 | 417 | if mode == 0 || mode == 3 || mode == 7 { 418 | for j in 0..pairs * 2 { 419 | put_bits(&mut self.data, &mut pos, 1, (qep[j * 4] as u32) & 1); 420 | } 421 | } 422 | 423 | // Quantized values 424 | bc7_code_qblock(&mut self.data, &mut pos, qblock, bits, flips); 425 | bc7_code_adjust_skip_mode01237(&mut self.data, mode, part_id); 426 | } 427 | 428 | fn bc7_code_mode45(&mut self, params: &Mode45Parameters, mode: usize) { 429 | let mut qep = params.qep; 430 | let mut qblock = params.qblock; 431 | let mut aqep = params.aqep; 432 | let mut 
aqblock = params.aqblock; 433 | let rotation = params.rotation; 434 | let swap = params.swap; 435 | 436 | let bits = 2; 437 | let abits = if mode == 4 { 3 } else { 2 }; 438 | let epbits = if mode == 4 { 5 } else { 7 }; 439 | let aepbits = if mode == 4 { 6 } else { 8 }; 440 | 441 | if swap == 0 { 442 | bc7_code_apply_swap_mode456(&mut qep, 4, &mut qblock, bits); 443 | bc7_code_apply_swap_mode456(&mut aqep, 1, &mut aqblock, abits); 444 | } else { 445 | std::mem::swap(&mut qblock, &mut aqblock); 446 | 447 | bc7_code_apply_swap_mode456(&mut aqep, 1, &mut qblock, bits); 448 | bc7_code_apply_swap_mode456(&mut qep, 4, &mut aqblock, abits); 449 | } 450 | 451 | // Clear state data 452 | self.data = [0; 5]; 453 | let mut pos = 0; 454 | 455 | // Mode 4-5 456 | put_bits(&mut self.data, &mut pos, (mode + 1) as u32, 1 << mode); 457 | 458 | // Rotation 459 | put_bits(&mut self.data, &mut pos, 2, (rotation + 1) & 3); 460 | 461 | if mode == 4 { 462 | put_bits(&mut self.data, &mut pos, 1, swap); 463 | } 464 | 465 | // Endpoints 466 | for p in 0..3 { 467 | put_bits(&mut self.data, &mut pos, epbits, qep[p] as u32); 468 | put_bits(&mut self.data, &mut pos, epbits, qep[4 + p] as u32); 469 | } 470 | 471 | // Alpha endpoints 472 | put_bits(&mut self.data, &mut pos, aepbits, aqep[0] as u32); 473 | put_bits(&mut self.data, &mut pos, aepbits, aqep[1] as u32); 474 | 475 | // Quantized values 476 | bc7_code_qblock(&mut self.data, &mut pos, qblock, bits, 0); 477 | bc7_code_qblock(&mut self.data, &mut pos, aqblock, abits, 0); 478 | } 479 | 480 | fn bc7_code_mode6(&mut self, qep: &mut [i32], qblock: &mut [u32; 2]) { 481 | bc7_code_apply_swap_mode456(qep, 4, qblock, 4); 482 | 483 | self.data = [0; 5]; 484 | let mut pos = 0; 485 | 486 | // Mode 6 487 | put_bits(&mut self.data, &mut pos, 7, 64); 488 | 489 | // Endpoints 490 | for p in 0..4 { 491 | put_bits(&mut self.data, &mut pos, 7, (qep[p] as u32) >> 1); 492 | put_bits(&mut self.data, &mut pos, 7, (qep[4 + p] as u32) >> 1); 493 | } 494 | 495 | // 
P bits 496 | put_bits(&mut self.data, &mut pos, 1, (qep[0] as u32) & 1); 497 | put_bits(&mut self.data, &mut pos, 1, (qep[4] as u32) & 1); 498 | 499 | // Quantized values 500 | bc7_code_qblock(&mut self.data, &mut pos, *qblock, 4, 0); 501 | } 502 | 503 | fn bc7_enc_mode01237_part_fast( 504 | &self, 505 | qep: &mut [i32; 24], 506 | qblock: &mut [u32; 2], 507 | part_id: i32, 508 | mode: usize, 509 | ) -> f32 { 510 | let pattern = get_pattern(part_id); 511 | let bits = if mode == 0 || mode == 1 { 3 } else { 2 }; 512 | let pairs = if mode == 0 || mode == 2 { 3 } else { 2 }; 513 | let channels = if mode == 7 { 4 } else { 3 }; 514 | 515 | let mut ep = [0.0; 24]; 516 | for j in 0..pairs { 517 | let mask = get_pattern_mask(part_id, j as u32); 518 | Self::block_segment(&mut ep[j * 8..], &self.block, mask, channels); 519 | } 520 | 521 | Self::ep_quant_dequant(qep, &mut ep, mode, channels); 522 | 523 | block_quant(qblock, &self.block, bits, &ep, pattern, channels) 524 | } 525 | 526 | fn bc7_enc_mode01237(&mut self, mode: usize, part_list: &[i32; 64], part_count: usize) { 527 | if part_count == 0 { 528 | return; 529 | } 530 | 531 | let bits = if mode == 0 || mode == 1 { 3 } else { 2 }; 532 | let pairs = if mode == 0 || mode == 2 { 3 } else { 2 }; 533 | let channels = if mode == 7 { 4 } else { 3 }; 534 | 535 | let mut best_qep = [0; 24]; 536 | let mut best_qblock = [0; 2]; 537 | let mut best_part_id = -1; 538 | let mut best_err = f32::INFINITY; 539 | 540 | for &part in part_list[..part_count].iter() { 541 | let mut part_id = part & 63; 542 | part_id = if pairs == 3 { part_id + 64 } else { part_id }; 543 | 544 | let mut qep = [0; 24]; 545 | let mut qblock = [0; 2]; 546 | let err = self.bc7_enc_mode01237_part_fast(&mut qep, &mut qblock, part_id, mode); 547 | 548 | if err < best_err { 549 | best_qep[..(8 * pairs)].copy_from_slice(&qep[..(8 * pairs)]); 550 | best_qblock.copy_from_slice(&qblock); 551 | 552 | best_part_id = part_id; 553 | best_err = err; 554 | } 555 | } 556 | 557 | 
let refine_iterations = self.settings.refine_iterations[mode]; 558 | for _ in 0..refine_iterations { 559 | let mut ep = [0.0; 24]; 560 | for j in 0..pairs { 561 | let mask = get_pattern_mask(best_part_id, j as u32); 562 | opt_endpoints( 563 | &mut ep[j * 8..], 564 | &self.block, 565 | bits, 566 | best_qblock, 567 | mask, 568 | channels, 569 | ); 570 | } 571 | 572 | let mut qep = [0; 24]; 573 | let mut qblock = [0; 2]; 574 | 575 | Self::ep_quant_dequant(&mut qep, &mut ep, mode, channels); 576 | 577 | let pattern = get_pattern(best_part_id); 578 | let err = block_quant(&mut qblock, &self.block, bits, &ep, pattern, channels); 579 | 580 | if err < best_err { 581 | best_qep[..(8 * pairs)].copy_from_slice(&qep[..(8 * pairs)]); 582 | best_qblock.copy_from_slice(&qblock); 583 | 584 | best_err = err; 585 | } 586 | } 587 | 588 | if mode != 7 { 589 | best_err += self.opaque_err; 590 | } 591 | 592 | if best_err < self.best_err { 593 | self.best_err = best_err; 594 | self.bc7_code_mode01237(&mut best_qep, best_qblock, best_part_id, mode); 595 | } 596 | } 597 | 598 | fn bc7_enc_mode02(&mut self) { 599 | let part_list: [i32; 64] = std::array::from_fn(|part| part as i32); 600 | 601 | self.bc7_enc_mode01237(0, &part_list, 16); 602 | 603 | if self.settings.skip_mode2 == 0 { 604 | self.bc7_enc_mode01237(2, &part_list, 64); 605 | } 606 | } 607 | 608 | fn bc7_enc_mode13(&mut self) { 609 | if self.settings.fast_skip_threshold_mode1 == 0 610 | && self.settings.fast_skip_threshold_mode3 == 0 611 | { 612 | return; 613 | } 614 | 615 | let mut full_stats = [0.0; 15]; 616 | compute_stats_masked(&mut full_stats, &self.block, 0xFFFFFFFF, 3); 617 | 618 | let mut part_list = [0; 64]; 619 | for part in 0..64 { 620 | let mask = get_pattern_mask(part, 0); 621 | let bound12 = block_pca_bound_split(&self.block, mask, full_stats, 3); 622 | let bound = bound12 as i32; 623 | part_list[part as usize] = part + bound * 64; 624 | } 625 | 626 | let partial_count = u32::max( 627 | 
self.settings.fast_skip_threshold_mode1, 628 | self.settings.fast_skip_threshold_mode3, 629 | ); 630 | partial_sort_list(&mut part_list, 64, partial_count); 631 | self.bc7_enc_mode01237( 632 | 1, 633 | &part_list, 634 | self.settings.fast_skip_threshold_mode1 as usize, 635 | ); 636 | self.bc7_enc_mode01237( 637 | 3, 638 | &part_list, 639 | self.settings.fast_skip_threshold_mode3 as usize, 640 | ); 641 | } 642 | 643 | fn bc7_enc_mode45_candidate( 644 | &self, 645 | best_candidate: &mut Mode45Parameters, 646 | best_err: &mut f32, 647 | mode: usize, 648 | rotation: u32, 649 | swap: u32, 650 | ) { 651 | let mut bits = 2; 652 | let mut abits = 2; 653 | let mut aepbits = 8; 654 | 655 | if mode == 4 { 656 | abits = 3; 657 | aepbits = 6; 658 | } 659 | 660 | // (mode 4) 661 | if swap == 1 { 662 | bits = 3; 663 | abits = 2; 664 | } 665 | 666 | let mut candidate_block = [0.0; 64]; 667 | 668 | for k in 0..16 { 669 | for p in 0..3 { 670 | candidate_block[k + p * 16] = self.block[k + p * 16]; 671 | } 672 | 673 | if rotation < 3 { 674 | // Apply channel rotation 675 | if self.settings.channels == 4 { 676 | candidate_block[k + rotation as usize * 16] = self.block[k + 3 * 16]; 677 | } 678 | if self.settings.channels == 3 { 679 | candidate_block[k + rotation as usize * 16] = 255.0; 680 | } 681 | } 682 | } 683 | 684 | let mut ep = [0.0; 8]; 685 | Self::block_segment(&mut ep, &candidate_block, 0xFFFFFFFF, 3); 686 | 687 | let mut qep = [0; 8]; 688 | Self::ep_quant_dequant(&mut qep, &mut ep, mode, 3); 689 | 690 | let mut qblock = [0; 2]; 691 | let mut err = block_quant(&mut qblock, &candidate_block, bits, &ep, 0, 3); 692 | 693 | // Refine 694 | let refine_iterations = self.settings.refine_iterations[mode]; 695 | for _ in 0..refine_iterations { 696 | opt_endpoints(&mut ep, &candidate_block, bits, qblock, 0xFFFFFFFF, 3); 697 | Self::ep_quant_dequant(&mut qep, &mut ep, mode, 3); 698 | err = block_quant(&mut qblock, &candidate_block, bits, &ep, 0, 3); 699 | } 700 | 701 | let channel_data: 
[f32; 16] = 702 | std::array::from_fn(|k| self.block[k + rotation as usize * 16]); 703 | 704 | // Encoding selected channel 705 | let mut aqep = [0; 2]; 706 | let mut aqblock = [0; 2]; 707 | 708 | err += self.opt_channel(&mut aqblock, &mut aqep, &channel_data, abits, aepbits); 709 | 710 | if err < *best_err { 711 | best_candidate.qep.copy_from_slice(&qep[..8]); 712 | best_candidate.qblock.copy_from_slice(&qblock); 713 | best_candidate.aqblock.copy_from_slice(&aqblock); 714 | best_candidate.aqep.copy_from_slice(&aqep); 715 | best_candidate.rotation = rotation; 716 | best_candidate.swap = swap; 717 | *best_err = err; 718 | } 719 | } 720 | 721 | fn bc7_enc_mode45(&mut self) { 722 | let mut best_candidate = Mode45Parameters::default(); 723 | let mut best_err = self.best_err; 724 | 725 | let channel0 = self.settings.mode45_channel0; 726 | for p in channel0..self.settings.channels { 727 | self.bc7_enc_mode45_candidate(&mut best_candidate, &mut best_err, 4, p, 0); 728 | self.bc7_enc_mode45_candidate(&mut best_candidate, &mut best_err, 4, p, 1); 729 | } 730 | 731 | // Mode 4 732 | if best_err < self.best_err { 733 | self.best_err = best_err; 734 | self.bc7_code_mode45(&best_candidate, 4); 735 | } 736 | 737 | for p in channel0..self.settings.channels { 738 | self.bc7_enc_mode45_candidate(&mut best_candidate, &mut best_err, 5, p, 0); 739 | } 740 | 741 | // Mode 5 742 | if best_err < self.best_err { 743 | self.best_err = best_err; 744 | self.bc7_code_mode45(&best_candidate, 5); 745 | } 746 | } 747 | 748 | fn bc7_enc_mode6(&mut self) { 749 | const MODE: usize = 6; 750 | const BITS: u32 = 4; 751 | 752 | let mut ep = [0.0; 8]; 753 | Self::block_segment( 754 | &mut ep, 755 | &self.block, 756 | 0xFFFFFFFF, 757 | self.settings.channels as usize, 758 | ); 759 | 760 | if self.settings.channels == 3 { 761 | ep[3] = 255.0; 762 | ep[7] = 255.0; 763 | } 764 | 765 | let mut qep = [0; 8]; 766 | Self::ep_quant_dequant(&mut qep, &mut ep, MODE, self.settings.channels as usize); 767 | 768 | 
let mut qblock = [0; 2];
        let mut err = block_quant(
            &mut qblock,
            &self.block,
            BITS,
            &ep,
            0,
            self.settings.channels as usize,
        );

        // Refine: alternate between optimizing the endpoints for the current
        // indices and re-quantizing the indices for the refined endpoints.
        let refine_iterations = self.settings.refine_iterations[MODE];
        for _ in 0..refine_iterations {
            opt_endpoints(
                &mut ep,
                &self.block,
                BITS,
                qblock,
                0xFFFFFFFF,
                self.settings.channels as usize,
            );
            Self::ep_quant_dequant(&mut qep, &mut ep, MODE, self.settings.channels as usize);
            err = block_quant(
                &mut qblock,
                &self.block,
                BITS,
                &ep,
                0,
                self.settings.channels as usize,
            );
        }

        if err < self.best_err {
            self.best_err = err;
            self.bc7_code_mode6(&mut qep, &mut qblock);
        }
    }

    /// Tries mode 7 (two subsets, RGBA endpoints) on the most promising
    /// partitions, ranked by a PCA split bound.
    fn bc7_enc_mode7(&mut self) {
        // A threshold of zero disables mode 7 entirely.
        if self.settings.fast_skip_threshold_mode7 == 0 {
            return;
        }

        let mut full_stats = [0.0; 15];
        compute_stats_masked(
            &mut full_stats,
            &self.block,
            0xFFFFFFFF,
            self.settings.channels as usize,
        );

        // Combine each partition id with its PCA bound (bound * 64 + part) so
        // partial_sort_list can rank candidate partitions by the bound.
        let mut part_list = [0; 64];
        for part in 0..64 {
            let mask = get_pattern_mask(part, 0);
            let bound12 = block_pca_bound_split(
                &self.block,
                mask,
                full_stats,
                self.settings.channels as usize,
            );
            let bound = bound12 as i32;
            part_list[part as usize] = part + bound * 64;
        }

        partial_sort_list(&mut part_list, 64, self.settings.fast_skip_threshold_mode7);
        self.bc7_enc_mode01237(
            7,
            &part_list,
            self.settings.fast_skip_threshold_mode7 as usize,
        );
    }

    /// Runs every encoder enabled in the settings; the best encoding found so
    /// far is tracked in `self.best_err` and `self.data`.
    pub(crate) fn compress_block_bc7_core(&mut self) {
        if self.settings.mode_selection[0] != 0 {
            self.bc7_enc_mode02();
        }
        if self.settings.mode_selection[1] != 0 {
            self.bc7_enc_mode13();
            self.bc7_enc_mode7();
        }
        if self.settings.mode_selection[2] != 0 {
self.bc7_enc_mode45();
        }
        if self.settings.mode_selection[3] != 0 {
            self.bc7_enc_mode6();
        }
    }

    /// Computes the error incurred by treating the block as fully opaque
    /// (alpha forced to 255).
    ///
    /// For 3-channel input there is no alpha to lose, so the penalty is zero;
    /// otherwise it is the summed squared deviation of the 16 alpha values
    /// from 255. Opaque-only modes add this penalty to their color error.
    pub(crate) fn compute_opaque_err(&mut self) {
        self.opaque_err = if self.settings.channels == 3 {
            0.0
        } else {
            // Alpha occupies the last 16 floats of the deinterleaved block.
            self.block[48..64]
                .iter()
                .map(|&alpha| sq(alpha - 255.0))
                .sum()
        };
    }
}