├── .github └── workflows │ └── check_and_test.yaml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches ├── bench_alpha.rs ├── bench_color_mapper.rs ├── bench_compare_l.rs ├── bench_compare_l16.rs ├── bench_compare_l32f.rs ├── bench_compare_la.rs ├── bench_compare_la16.rs ├── bench_compare_la32f.rs ├── bench_compare_rgb.rs ├── bench_compare_rgb16.rs ├── bench_compare_rgb32f.rs ├── bench_compare_rgba.rs ├── bench_compare_rgba16.rs ├── bench_compare_rgba32f.rs ├── bench_resize.rs ├── templates │ ├── bench_compare_l.md.tera │ ├── bench_compare_l16.md.tera │ ├── bench_compare_l32f.md.tera │ ├── bench_compare_la.md.tera │ ├── bench_compare_la16.md.tera │ ├── bench_compare_la32f.md.tera │ ├── bench_compare_rgb.md.tera │ ├── bench_compare_rgb16.md.tera │ ├── bench_compare_rgb32f.md.tera │ ├── bench_compare_rgba.md.tera │ ├── bench_compare_rgba16.md.tera │ ├── bench_compare_rgba32f.md.tera │ └── introduction.md.tera └── utils │ ├── bencher.rs │ ├── mod.rs │ ├── resize_functions.rs │ ├── results.rs │ └── testing.rs ├── benchmarks-arm64.md ├── benchmarks-wasm32.md ├── benchmarks-x86_64.md ├── data ├── crop_test.png ├── nasa-4019x4019-rgba.png ├── nasa-4019x4019.png ├── nasa-4928x3279-rgba.png ├── nasa-4928x3279.png ├── nasa-852x567-rgba.png └── nasa-852x567.png ├── dev.md ├── resizer ├── Cargo.toml └── src │ ├── main.rs │ └── structs.rs ├── rustfmt.toml ├── src ├── alpha │ ├── common.rs │ ├── errors.rs │ ├── f32x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── mod.rs │ ├── u16x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ └── u8x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs ├── array_chunks.rs ├── change_components_type.rs ├── color │ ├── mappers.rs │ └── mod.rs ├── convolution │ ├── f32x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── filters.rs │ ├── i32x1 │ │ ├── mod.rs │ │ └── native.rs │ ├── macros.rs │ ├── mod.rs │ ├── optimisations.rs │ ├── u16x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── vertical_f32 │ │ ├── avx2.rs │ │ ├── 
mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── vertical_u16 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ └── vertical_u8 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs ├── cpu_extensions.rs ├── crop_box.rs ├── errors.rs ├── image_view.rs ├── images │ ├── cropped_image.rs │ ├── image.rs │ ├── image_crate.rs │ ├── mod.rs │ ├── typed_cropped_image.rs │ ├── typed_image.rs │ └── unsafe_image.rs ├── lib.rs ├── mul_div.rs ├── neon_utils.rs ├── pixels.rs ├── resizer.rs ├── simd_utils.rs ├── testing.rs ├── threading.rs ├── utils.rs └── wasm32_utils.rs └── tests ├── alpha_tests.rs ├── color_tests.rs ├── image_view.rs ├── images_tests.rs ├── resize_tests.rs └── testing.rs /.github/workflows/check_and_test.yaml: -------------------------------------------------------------------------------- 1 | name: Check and Test 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | workflow_dispatch: { } 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | DONT_SAVE_RESULT: 1 13 | RAYON_NUM_THREADS: 4 14 | 15 | jobs: 16 | run_tests: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ ubuntu-latest, macos-latest, windows-latest ] 21 | 22 | name: Test `cargo check/test` on ${{ matrix.os }} 23 | runs-on: ${{ matrix.os }} 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: Swatinem/rust-cache@v2 29 | with: 30 | cache-on-failure: "true" 31 | 32 | - name: Run single-thread tests 33 | run: | 34 | cargo check 35 | cargo test 36 | 37 | - name: Run multi-thread tests 38 | run: | 39 | cargo check --features rayon 40 | cargo test --features rayon 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .* 3 | !/.gitignore 4 | data/result 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "resizer", 4 | ] 5 | 6 | 7 | [package] 8 | name = "fast_image_resize" 9 | version = "5.1.4" 10 | authors = ["Kirill Kuzminykh "] 11 | edition = "2021" 12 | license = "MIT OR Apache-2.0" 13 | description = "Library for fast image resizing with using of SIMD instructions" 14 | readme = "README.md" 15 | keywords = ["image", "resize"] 16 | repository = "https://github.com/cykooz/fast_image_resize" 17 | documentation = "https://docs.rs/crate/fast_image_resize" 18 | exclude = ["/data", "/.github"] 19 | 20 | 21 | [dependencies] 22 | cfg-if = "1.0" 23 | num-traits = "0.2.19" 24 | thiserror = "1.0" 25 | document-features = "0.2.11" 26 | # Optional dependencies 27 | image = { version = "0.25.6", optional = true, default-features = false } 28 | bytemuck = { version = "1.23", optional = true } 29 | rayon = { version = "1.10", optional = true } 30 | 31 | 32 | [features] 33 | ## Enable this feature to implement traits [IntoImageView](crate::IntoImageView) and 34 | ## [IntoImageViewMut](crate::IntoImageViewMut) for the 35 | ## [DynamicImage](https://docs.rs/image/latest/image/enum.DynamicImage.html) 36 | ## type from the `image` crate. 37 | image = ["dep:image", "dep:bytemuck"] 38 | ## This feature enables image processing in `rayon` thread pool. 
39 | rayon = ["dep:rayon", "resize/rayon", "image/rayon"] 40 | for_testing = ["image", "image/png"] 41 | only_u8x4 = [] # This can be used to experiment with the crate's code. 42 | 43 | 44 | [dev-dependencies] 45 | fast_image_resize = { path = ".", features = ["for_testing"] } 46 | resize = { version = "0.8.8", default-features = false, features = ["std"] } 47 | rgb = "0.8.50" 48 | png = "0.17.16" 49 | serde = { version = "1.0", features = ["serde_derive"] } 50 | serde_json = "1.0" 51 | walkdir = "2.5" 52 | itertools = "0.14.0" 53 | criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] } 54 | tera = "1.20" 55 | 56 | 57 | [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] 58 | nix = { version = "0.30.1", default-features = false, features = ["sched"] } 59 | 60 | 61 | [target.'cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))'.dev-dependencies] 62 | libvips = "1.7" 63 | 64 | 65 | [profile.test] 66 | opt-level = 1 67 | incremental = true 68 | 69 | 70 | # debug builds for deps 71 | [profile.dev.package.'*'] 72 | opt-level = 3 73 | 74 | 75 | # release build for procmacros - same config as debug build for procmacros 76 | [profile.release.build-override] 77 | opt-level = 2 78 | debug = false # when possible 79 | 80 | 81 | [profile.release] 82 | opt-level = 3 83 | incremental = true 84 | #lto = true 85 | #codegen-units = 1 86 | strip = true 87 | 88 | 89 | #[profile.release.package.fast_image_resize] 90 | #codegen-units = 1 91 | 92 | 93 | [profile.release.package.image] 94 | codegen-units = 1 95 | 96 | 97 | [profile.release.package.resize] 98 | codegen-units = 1 99 | 100 | 101 | [package.metadata.release] 102 | pre-release-replacements = [ 103 | { file = "CHANGELOG.md", search = "Unreleased", replace = "{{version}}" }, 104 | { file = "CHANGELOG.md", search = "ReleaseDate", replace = "{{date}}" } 105 | ] 106 | 107 | 108 | [[bench]] 109 | name = "bench_resize" 110 | harness = false 111 | 112 | 113 | [[bench]] 114 | name = "bench_alpha" 115 | harness = false 116 | 117 | 118 | [[bench]] 119 | name = "bench_compare_rgb" 120 | harness = false 121 | 122 | 123 | [[bench]] 124 | name = "bench_compare_rgb16" 125 | harness = false 126 | 127 | 128 | [[bench]] 129 | name = "bench_compare_rgb32f" 130 | harness = false 131 | 132 | 133 | [[bench]] 134 | name = "bench_compare_rgba" 135 | harness = false 136 | 137 | 138 | [[bench]] 139 | name = "bench_compare_rgba16" 140 | harness = false 141 | 142 | 143 | [[bench]] 144 | name = "bench_compare_rgba32f" 145 | harness = false 146 | 147 | 148 | [[bench]] 149 | name = "bench_compare_l" 150 | harness = false 151 | 152 | 153 | [[bench]] 154 | name = "bench_compare_la" 155 | harness = false 156 | 157 | 158 | [[bench]] 159 | name = "bench_compare_l16" 160 | harness = false 161 | 162 | 163 | [[bench]] 164 | name = "bench_compare_la16" 165 | harness = false 166 | 167 | 168 | [[bench]] 169 | name = "bench_compare_l32f" 170 | harness = false 171 | 172 | 173 | [[bench]] 174 | name = "bench_compare_la32f" 175 | harness = false 176 | 177 | 178 | [[bench]] 179 | name = "bench_color_mapper" 180 | harness = false 181 | 182 | 183 | # Header of next release in CHANGELOG.md: 184 | # ## [Unreleased] - ReleaseDate 185 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kirill Kuzminykh 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benches/bench_color_mapper.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::create_srgb_mapper; 2 | use fast_image_resize::images::Image; 3 | use fast_image_resize::pixels::U8x3; 4 | use utils::pin_process_to_cpu0; 5 | use utils::testing::PixelTestingExt; 6 | 7 | mod utils; 8 | 9 | pub fn bench_color_mapper(bench_group: &mut utils::BenchGroup) { 10 | let src_image = U8x3::load_big_src_image(); 11 | let mut dst_image = Image::new( 12 | src_image.width(), 13 | src_image.height(), 14 | src_image.pixel_type(), 15 | ); 16 | let mapper = create_srgb_mapper(); 17 | bench_group 18 | .criterion_group 19 | .bench_function("SRGB U8x3 => RGB U8x3", |bencher| { 20 | bencher.iter(|| { 21 | mapper.forward_map(&src_image, &mut dst_image).unwrap(); 22 | }) 23 | }); 24 | } 25 | 26 | fn main() { 27 | pin_process_to_cpu0(); 28 | utils::run_bench(bench_color_mapper, "Color mapper"); 29 | } 30 | -------------------------------------------------------------------------------- /benches/bench_compare_l.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8; 2 | use resize::Pixel::Gray8; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_compare_l(bench_group: &mut utils::BenchGroup) { 9 | type P = U8; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | Gray8, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_compare_l, "Compare resize of U8 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_l16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16; 2 | use resize::Pixel::Gray16; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_l16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | Gray16, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_l16, "Compare resize of U16 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_l32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32; 2 | use resize::Pixel::GrayF32; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_l32f(bench_group: &mut utils::BenchGroup) { 9 | type P = F32; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | GrayF32, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_l32f, "Compare resize of L32F image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_la.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la(bench_group: &mut utils::BenchGroup) { 6 | type P = U8x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la, "Compare resize of LA image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_la16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la16(bench_group: &mut utils::BenchGroup) { 6 | type P = U16x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la16, "Compare resize of LA16 image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_la32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la32f(bench_group: &mut utils::BenchGroup) { 6 | type P = F32x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la32f, "Compare resize of LA32F image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x3; 2 | use resize::Pixel::RGB8; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb(bench_group: &mut utils::BenchGroup) { 9 | type P = U8x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGB8, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb, "Compare resize of RGB image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x3; 2 | use resize::Pixel::RGB16; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGB16, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb16, "Compare resize of RGB16 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x3; 2 | use resize::Pixel::RGBF32; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb32f(bench_group: &mut utils::BenchGroup) { 9 | type P = F32x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGBF32, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb32f, "Compare resize of RGB32F image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x4; 2 | use resize::Pixel::RGBA8P; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgba(bench_group: &mut utils::BenchGroup) { 9 | type P = U8x4; 10 | let src_image = P::load_big_image(); 11 | utils::resize_resize( 12 | bench_group, 13 | RGBA8P, 14 | src_image.as_raw().as_rgba(), 15 | src_image.width(), 16 | src_image.height(), 17 | ); 18 | utils::libvips_resize::
<P>(bench_group, true); 19 | utils::fir_resize::<P>
(bench_group, true); 20 | } 21 | 22 | fn main() { 23 | let res = utils::run_bench(bench_downscale_rgba, "Compare resize of RGBA image"); 24 | utils::print_and_write_compare_result(&res); 25 | } 26 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x4; 2 | use resize::Pixel::RGBA16P; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgba16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16x4; 10 | let src_image = P::load_big_image(); 11 | utils::resize_resize( 12 | bench_group, 13 | RGBA16P, 14 | src_image.as_raw().as_rgba(), 15 | src_image.width(), 16 | src_image.height(), 17 | ); 18 | utils::libvips_resize::
<P>(bench_group, true); 19 | utils::fir_resize::<P>
(bench_group, true); 20 | } 21 | 22 | fn main() { 23 | let res = utils::run_bench(bench_downscale_rgba16, "Compare resize of RGBA16 image"); 24 | utils::print_and_write_compare_result(&res); 25 | } 26 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x4; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_rgba32f(bench_group: &mut utils::BenchGroup) { 6 | type P = F32x4; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_rgba32f, "Compare resize of RGBA32F image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L8 image (U8) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with one byte per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L16 image (U16) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with two bytes per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L32F image (F32) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with two bytes per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA8 image (U8x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (two bytes per pixel). 10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA16 (luma with alpha channel) image (U16x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (four bytes per pixel). 
10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA32F (luma with alpha channel) image (F32x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (two `f32` values per pixel). 10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB8 image (U8x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | - Numbers in the table mean a duration of image resizing in milliseconds. 9 | 10 | {{ compare_results -}} 11 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB16 image (U16x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into RGB16 image. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB32F image (F32x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into RGB32F image. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA8 image (U8x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 
10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | 12 | {{ compare_results -}} 13 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA16 image (U16x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | 12 | {{ compare_results -}} 13 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA32F image (F32x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | - The `resize` crate does not support multiplying and dividing by alpha channel 12 | for this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/introduction.md.tera: -------------------------------------------------------------------------------- 1 | ## Benchmarks of fast_image_resize crate for {{ arch_name }} architecture 2 | 3 | Environment: 4 | 5 | {% if arch_id == "arm64" -%} 6 | - CPU: Neoverse-N1 2GHz (Oracle Cloud Compute, VM.Standard.A1.Flex) 7 | {% else -%} 8 | - CPU: AMD Ryzen 9 5950X 9 | - RAM: DDR4 4000 MHz 10 | {% endif -%} 11 | - Ubuntu 24.04 (linux 6.11.0) 12 | - Rust 1.87.0 13 | - criterion = "0.5.1" 14 | - fast_image_resize = "5.1.4" 15 | {% if arch_id == "wasm32" -%} 16 | - wasmtime = "32.0.0" 17 | {% endif %} 18 | 19 | Other libraries used to compare of resizing speed: 20 | 21 | - image = "0.25.6" () 22 | - resize = "0.8.8" (, single-threaded mode) 23 | {% if arch_id != "wasm32" -%} 24 | - libvips = "8.15.1" (single-threaded mode) 25 | {% endif %} 26 | 27 | Resize algorithms: 28 | 29 | - Nearest 30 | - Box - convolution with minimal kernel size 1x1 px 31 | - Bilinear - convolution with minimal kernel size 2x2 px 32 | - Bicubic (CatmullRom) - convolution with minimal kernel size 4x4 px 33 | - Lanczos3 - convolution with minimal kernel size 6x6 px 34 | -------------------------------------------------------------------------------- /benches/utils/bencher.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | use std::time::{Duration, SystemTime}; 4 | 5 | use criterion::measurement::WallTime; 6 | use criterion::{Bencher, BenchmarkGroup, BenchmarkId, Criterion}; 7 | 8 | use super::{cargo_target_directory, get_arch_id_and_name, get_results, BenchResult}; 9 | 10 | pub struct BenchGroup<'a> { 11 | pub criterion_group: BenchmarkGroup<'a, WallTime>, 12 | old_results: Vec, 13 | results: Vec, 14 | } 15 | 16 | impl<'a> 
BenchGroup<'a> { 17 | fn finish(self) -> Vec { 18 | self.criterion_group.finish(); 19 | self.results 20 | } 21 | } 22 | 23 | pub fn run_bench(bench_fn: F, name: &str) -> Vec 24 | where 25 | F: FnOnce(&mut BenchGroup), 26 | { 27 | if env::var("PIN_TO_CPU0").is_ok() { 28 | pin_process_to_cpu0(); 29 | } 30 | 31 | let arch_id = get_arch_id_and_name().0; 32 | let output_dir = criterion_output_directory().join(arch_id); 33 | let mut criterion = Criterion::default() 34 | .output_directory(&output_dir) 35 | .configure_from_args(); 36 | 37 | let now = SystemTime::now(); 38 | let results_dir = output_dir.join(name); 39 | 40 | let results_lifetime: u32 = env::var("RESULTS_LIFETIME") 41 | .unwrap_or_else(|_| "0".to_owned()) 42 | .parse() 43 | .unwrap_or_default(); 44 | let old_results = if results_lifetime > 0 && name.starts_with("Compare ") { 45 | let old_now = now - Duration::from_secs(results_lifetime as u64 * 24 * 3600); 46 | get_results(&results_dir, &old_now) 47 | } else { 48 | vec![] 49 | }; 50 | 51 | let mut group = BenchGroup { 52 | criterion_group: criterion.benchmark_group(name), 53 | old_results, 54 | results: vec![], 55 | }; 56 | bench_fn(&mut group); 57 | let mut results = group.finish(); 58 | criterion.final_summary(); 59 | 60 | let new_results = get_results(&results_dir, &now); 61 | if new_results.is_empty() { 62 | new_results 63 | } else { 64 | for res in results.iter_mut().filter(|r| r.estimate < 0.) { 65 | res.estimate = new_results 66 | .iter() 67 | .find(|new_res| { 68 | new_res.function_name == res.function_name && new_res.parameter == res.parameter 69 | }) 70 | .map(|r| r.estimate) 71 | .unwrap_or(0.) 72 | } 73 | results 74 | } 75 | } 76 | 77 | pub fn bench( 78 | group: &mut BenchGroup, 79 | sample_size: usize, 80 | func_name: S1, 81 | parameter: S2, 82 | mut f: F, 83 | ) where 84 | S1: Into, 85 | S2: Into, 86 | F: FnMut(&mut Bencher), 87 | { 88 | let parameter = parameter.into(); 89 | let func_name = func_name.into(); 90 | // Use old results only for other libraries, not for 'fast_image_resize' 91 | if !func_name.starts_with("fir ") { 92 | if let Some(old_res) = group 93 | .old_results 94 | .iter() 95 | .find(|res| res.function_name == func_name && res.parameter == parameter) 96 | { 97 | group.results.push(old_res.clone()); 98 | println!( 99 | "SKIP benching of '{}' function with '{}' parameter due to using old result.", 100 | func_name, parameter 101 | ); 102 | return; 103 | } 104 | } 105 | 106 | group.results.push(BenchResult { 107 | function_name: func_name.clone(), 108 | parameter: parameter.clone(), 109 | estimate: -1., // Unknown result 110 | }); 111 | 112 | group.criterion_group.sample_size(sample_size); 113 | group.criterion_group.bench_with_input( 114 | BenchmarkId::new(func_name, ¶meter), 115 | ¶meter, 116 | |bencher, _| f(bencher), 117 | ); 118 | } 119 | 120 | /// Pin process to #0 CPU core 121 | pub fn pin_process_to_cpu0() { 122 | #[cfg(not(target_arch = "wasm32"))] 123 | { 124 | let mut cpu_set = nix::sched::CpuSet::new(); 125 | cpu_set.set(0).unwrap(); 126 | nix::sched::sched_setaffinity(nix::unistd::Pid::from_raw(0), &cpu_set).unwrap(); 127 | } 128 | } 129 | 130 | fn criterion_output_directory() -> PathBuf { 131 | if let Some(value) = env::var_os("CRITERION_HOME") { 132 | PathBuf::from(value) 133 | } else if let Some(path) = cargo_target_directory() { 134 | path.join("criterion") 135 | } else { 136 | PathBuf::from("target/criterion") 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /benches/utils/mod.rs: 
-------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | use std::process::Command; 4 | 5 | pub use bencher::*; 6 | pub use resize_functions::*; 7 | pub use results::*; 8 | use serde::Deserialize; 9 | 10 | mod bencher; 11 | mod resize_functions; 12 | mod results; 13 | pub mod testing; 14 | 15 | const fn get_arch_id_and_name() -> (&'static str, &'static str) { 16 | #[cfg(target_arch = "x86_64")] 17 | return ("x86_64", "x86_64"); 18 | #[cfg(target_arch = "aarch64")] 19 | return ("arm64", "arm64"); 20 | #[cfg(target_arch = "wasm32")] 21 | return ("wasm32", "Wasm32"); 22 | #[cfg(not(any( 23 | target_arch = "x86_64", 24 | target_arch = "aarch64", 25 | target_arch = "wasm32" 26 | )))] 27 | return ("unknown", "Unknown"); 28 | } 29 | 30 | /// Returns the Cargo target directory, possibly calling `cargo metadata` to 31 | /// figure it out. 32 | fn cargo_target_directory() -> Option { 33 | #[derive(Deserialize)] 34 | struct Metadata { 35 | target_directory: PathBuf, 36 | } 37 | 38 | env::var_os("CARGO_TARGET_DIR") 39 | .map(PathBuf::from) 40 | .or_else(|| { 41 | let output = Command::new(env::var_os("CARGO")?) 42 | .args(["metadata", "--format-version", "1"]) 43 | .output() 44 | .ok()?; 45 | let metadata: Metadata = serde_json::from_slice(&output.stdout).ok()?; 46 | Some(metadata.target_directory) 47 | }) 48 | } 49 | -------------------------------------------------------------------------------- /benches/utils/testing.rs: -------------------------------------------------------------------------------- 1 | ../../tests/testing.rs -------------------------------------------------------------------------------- /data/crop_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/crop_test.png -------------------------------------------------------------------------------- /data/nasa-4019x4019-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4019x4019-rgba.png -------------------------------------------------------------------------------- /data/nasa-4019x4019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4019x4019.png -------------------------------------------------------------------------------- /data/nasa-4928x3279-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4928x3279-rgba.png -------------------------------------------------------------------------------- /data/nasa-4928x3279.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4928x3279.png -------------------------------------------------------------------------------- /data/nasa-852x567-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-852x567-rgba.png 
-------------------------------------------------------------------------------- /data/nasa-852x567.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-852x567.png -------------------------------------------------------------------------------- /dev.md: -------------------------------------------------------------------------------- 1 | # Preparation 2 | 3 | Install system libraries: 4 | 5 | - libvips-dev (used in benchmarks) 6 | 7 | Install additional toolchains: 8 | 9 | - Arm64: 10 | ```shell 11 | rustup target add aarch64-unknown-linux-gnu 12 | ``` 13 | - Wasm32: 14 | ```shell 15 | rustup target add wasm32-wasip2 16 | ``` 17 | Install [Wasmtime](https://wasmtime.dev/). 18 | 19 | # Tests 20 | 21 | Run tests with saving result images as files in `./data` directory: 22 | 23 | ```shell 24 | SAVE_RESULT=1 cargo test 25 | ``` 26 | 27 | # Benchmarks 28 | 29 | Run benchmarks to compare with other crates for image resizing and write results into 30 | report files, such as `./benchmarks-x86_64.md`: 31 | 32 | ```shell 33 | WRITE_COMPARE_RESULT=1 cargo bench -- Compare 34 | ``` 35 | 36 | If you want to use old benchmark results for other crates, you must add 37 | an env variable with the number of days as a result lifetime: 38 | 39 | ```shell 40 | WRITE_COMPARE_RESULT=1 RESULTS_LIFETIME=5 cargo bench -- Compare 41 | ``` 42 | 43 | # Wasm32 44 | 45 | Specify build target and runner in `.cargo/config.toml` file. 46 | 47 | ```toml 48 | [build] 49 | target = "wasm32-wasip2" 50 | 51 | [target.wasm32-wasip2] 52 | runner = "wasmtime --dir=. --" 53 | ``` 54 | 55 | Run tests: 56 | 57 | ```shell 58 | cargo test 59 | ``` 60 | 61 | Run tests with saving result images as files in `./data` directory: 62 | 63 | ```shell 64 | CARGO_TARGET_WASM32_WASIP2_RUNNER="wasmtime --dir=. --env SAVE_RESULT=1 --" cargo test 65 | ``` 66 | 67 | Run a specific benchmark in `quick` mode: 68 | 69 | ```shell 70 | cargo bench --bench bench_resize -- --color=always --quick 71 | ``` 72 | 73 | Run benchmarks to compare with other crates for image resizing and write results into 74 | report files, such as `./benchmarks-wasm32.md`: 75 | 76 | ```shell 77 | CARGO_TARGET_WASM32_WASIP2_RUNNER="wasmtime --dir=. 
--env WRITE_COMPARE_RESULT=1 --" cargo bench --no-fail-fast -- --color=always Compare 78 | ``` 79 | -------------------------------------------------------------------------------- /resizer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "resizer" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | 7 | [dependencies] 8 | fast_image_resize = { path = "..", features = ["image"] } 9 | image = "0.25.2" 10 | clap = { version = "4.5", features = ["derive"] } 11 | log = "0.4.22" 12 | env_logger = "0.11.3" 13 | anyhow = "1.0" 14 | clap-verbosity-flag = "2.2" 15 | once_cell = "1.19" 16 | 17 | 18 | [package.metadata.release] 19 | publish = false 20 | -------------------------------------------------------------------------------- /resizer/src/structs.rs: -------------------------------------------------------------------------------- 1 | use std::num::ParseIntError; 2 | use std::str::FromStr; 3 | 4 | use fast_image_resize as fr; 5 | 6 | #[derive(Copy, Clone, Debug)] 7 | pub enum Size { 8 | Pixels(u32), 9 | Percent(u16), 10 | } 11 | 12 | impl Size { 13 | pub fn calculate_size(&self, src_size: u32) -> u32 { 14 | match *self { 15 | Self::Pixels(size) => size, 16 | Self::Percent(percent) => (src_size as f32 * percent as f32 / 100.).round() as u32, 17 | } 18 | } 19 | } 20 | 21 | impl FromStr for Size { 22 | type Err = ParseIntError; 23 | 24 | fn from_str(s: &str) -> Result { 25 | if let Some(percent_str) = s.strip_suffix('%') { 26 | u16::from_str(percent_str).map(Self::Percent) 27 | } else { 28 | u32::from_str(s).map(Self::Pixels) 29 | } 30 | } 31 | } 32 | 33 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 34 | pub enum Algorithm { 35 | Nearest, 36 | Convolution, 37 | SuperSampling, 38 | } 39 | 40 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 41 | pub enum FilterType { 42 | /// Each pixel of source image contributes to one pixel of the 43 | /// destination image with identical weights. For upscaling is equivalent 44 | /// of `Nearest` resize algorithm. 45 | Box, 46 | /// Bilinear filter calculate the output pixel value using linear 47 | /// interpolation on all pixels that may contribute to the output value. 48 | Bilinear, 49 | /// Hamming filter has the same performance as `Bilinear` filter while 50 | /// providing the image downscaling quality comparable to bicubic 51 | /// (`CatmulRom` or `Mitchell`). Produces a sharper image than `Bilinear`, 52 | /// doesn't have dislocations on local level like with `Box`. 53 | /// The filter don’t show good quality for the image upscaling. 54 | Hamming, 55 | /// Catmull-Rom bicubic filter calculate the output pixel value using 56 | /// cubic interpolation on all pixels that may contribute to the output 57 | /// value. 58 | CatmullRom, 59 | /// Mitchell–Netravali bicubic filter calculate the output pixel value 60 | /// using cubic interpolation on all pixels that may contribute to the 61 | /// output value. 62 | Mitchell, 63 | /// Lanczos3 filter calculate the output pixel value using a high-quality 64 | /// Lanczos filter (a truncated sinc) on all pixels that may contribute 65 | /// to the output value. 
66 | Lanczos3, 67 | } 68 | 69 | impl From for fr::FilterType { 70 | fn from(filter_type: FilterType) -> Self { 71 | match filter_type { 72 | FilterType::Box => fr::FilterType::Box, 73 | FilterType::Bilinear => fr::FilterType::Bilinear, 74 | FilterType::Hamming => fr::FilterType::Hamming, 75 | FilterType::CatmullRom => fr::FilterType::CatmullRom, 76 | FilterType::Mitchell => fr::FilterType::Mitchell, 77 | FilterType::Lanczos3 => fr::FilterType::Lanczos3, 78 | } 79 | } 80 | } 81 | 82 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 83 | pub enum ColorSpace { 84 | Linear, 85 | /// sRGB for color images or gamma 2.2 for grayscale images 86 | NonLinear, 87 | } 88 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | unstable_features = true 2 | 3 | imports_granularity = "Module" 4 | group_imports = "StdExternalCrate" 5 | -------------------------------------------------------------------------------- /src/alpha/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | use crate::ImageError; 4 | 5 | #[derive(Error, Debug, Clone, Copy)] 6 | #[non_exhaustive] 7 | pub enum MulDivImagesError { 8 | #[error("Source or destination image is not supported")] 9 | ImageError(#[from] ImageError), 10 | #[error("Size of source image does not match to destination image")] 11 | SizeIsDifferent, 12 | #[error("Pixel type of source image does not match to destination image")] 13 | PixelTypesAreDifferent, 14 | } 15 | -------------------------------------------------------------------------------- /src/alpha/f32x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_extensions::CpuExtensions; 2 | use crate::pixels::F32x2; 3 | use crate::{ImageError, ImageView, ImageViewMut}; 4 | 5 | use super::AlphaMulDiv; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | 13 | type P = F32x2; 14 | 15 | impl AlphaMulDiv for P { 16 | fn multiply_alpha( 17 | src_view: &impl ImageView, 18 | dst_view: &mut impl ImageViewMut, 19 | cpu_extensions: CpuExtensions, 20 | ) -> Result<(), ImageError> { 21 | process_two_images! { 22 | multiple(src_view, dst_view, cpu_extensions); 23 | } 24 | Ok(()) 25 | } 26 | 27 | fn multiply_alpha_inplace( 28 | image_view: &mut impl ImageViewMut, 29 | cpu_extensions: CpuExtensions, 30 | ) -> Result<(), ImageError> { 31 | process_one_images! { 32 | multiply_inplace(image_view, cpu_extensions); 33 | } 34 | Ok(()) 35 | } 36 | 37 | fn divide_alpha( 38 | src_view: &impl ImageView, 39 | dst_view: &mut impl ImageViewMut, 40 | cpu_extensions: CpuExtensions, 41 | ) -> Result<(), ImageError> { 42 | process_two_images! { 43 | divide(src_view, dst_view, cpu_extensions); 44 | } 45 | Ok(()) 46 | } 47 | 48 | fn divide_alpha_inplace( 49 | image_view: &mut impl ImageViewMut, 50 | cpu_extensions: CpuExtensions, 51 | ) -> Result<(), ImageError> { 52 | process_one_images! 
{ 53 | divide_inplace(image_view, cpu_extensions); 54 | } 55 | Ok(()) 56 | } 57 | } 58 | 59 | fn multiple( 60 | src_view: &impl ImageView, 61 | dst_view: &mut impl ImageViewMut, 62 | cpu_extensions: CpuExtensions, 63 | ) { 64 | match cpu_extensions { 65 | #[cfg(target_arch = "x86_64")] 66 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 69 | // #[cfg(target_arch = "aarch64")] 70 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 71 | // #[cfg(target_arch = "wasm32")] 72 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 73 | _ => native::multiply_alpha(src_view, dst_view), 74 | } 75 | } 76 | 77 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 78 | match cpu_extensions { 79 | #[cfg(target_arch = "x86_64")] 80 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 83 | // #[cfg(target_arch = "aarch64")] 84 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 85 | // #[cfg(target_arch = "wasm32")] 86 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 87 | _ => native::multiply_alpha_inplace(image_view), 88 | } 89 | } 90 | 91 | fn divide( 92 | src_view: &impl ImageView, 93 | dst_view: &mut impl ImageViewMut, 94 | cpu_extensions: CpuExtensions, 95 | ) { 96 | match cpu_extensions { 97 | #[cfg(target_arch = "x86_64")] 98 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 101 | // #[cfg(target_arch = "aarch64")] 102 | // CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 103 | // #[cfg(target_arch = "wasm32")] 104 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 105 | _ => native::divide_alpha(src_view, dst_view), 106 | } 107 | } 108 | 109 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 110 | match cpu_extensions { 111 | #[cfg(target_arch = "x86_64")] 112 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 115 | // #[cfg(target_arch = "aarch64")] 116 | // CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 117 | // #[cfg(target_arch = "wasm32")] 118 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 119 | _ => native::divide_alpha_inplace(image_view), 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/alpha/f32x2/native.rs: -------------------------------------------------------------------------------- 1 | use num_traits::Zero; 2 | 3 | use crate::pixels::F32x2; 4 | use crate::utils::foreach_with_pre_reading; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | pub(crate) fn multiply_alpha( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | ) { 11 | let src_rows = src_view.iter_rows(0); 12 | let dst_rows = dst_view.iter_rows_mut(0); 13 | 14 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 15 | multiply_alpha_row(src_row, dst_row); 16 | 
} 17 | } 18 | 19 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 20 | for row in image_view.iter_rows_mut(0) { 21 | multiply_alpha_row_inplace(row); 22 | } 23 | } 24 | 25 | #[inline(always)] 26 | pub(crate) fn multiply_alpha_row(src_row: &[F32x2], dst_row: &mut [F32x2]) { 27 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 28 | let components: [f32; 2] = src_pixel.0; 29 | let alpha = components[1]; 30 | dst_pixel.0 = [components[0] * alpha, alpha]; 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [F32x2]) { 36 | for pixel in row { 37 | pixel.0[0] *= pixel.0[1]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for row in image_view.iter_rows_mut(0) { 59 | divide_alpha_row_inplace(row); 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | pub(crate) fn divide_alpha_row(src_row: &[F32x2], dst_row: &mut [F32x2]) { 65 | foreach_with_pre_reading( 66 | src_row.iter().zip(dst_row), 67 | |(&src_pixel, dst_pixel)| (src_pixel, dst_pixel), 68 | |(src_pixel, dst_pixel)| { 69 | let alpha = src_pixel.0[1]; 70 | if alpha.is_zero() { 71 | dst_pixel.0 = [0.; 2]; 72 | } else { 73 | dst_pixel.0 = [src_pixel.0[0] / alpha, alpha]; 74 | } 75 | }, 76 | ); 77 | } 78 | 79 | #[inline(always)] 80 | pub(crate) fn divide_alpha_row_inplace(row: &mut [F32x2]) { 81 | for pixel in row { 82 | let components: [f32; 2] = pixel.0; 83 | let alpha = components[1]; 84 | if alpha.is_zero() { 85 | pixel.0[0] = 0.; 86 | } else { 87 | pixel.0[0] = components[0] / alpha; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/alpha/f32x4/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_extensions::CpuExtensions; 2 | use crate::pixels::F32x4; 3 | use crate::{ImageError, ImageView, ImageViewMut}; 4 | 5 | use super::AlphaMulDiv; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | 13 | type P = F32x4; 14 | 15 | impl AlphaMulDiv for P { 16 | fn multiply_alpha( 17 | src_view: &impl ImageView, 18 | dst_view: &mut impl ImageViewMut, 19 | cpu_extensions: CpuExtensions, 20 | ) -> Result<(), ImageError> { 21 | process_two_images! { 22 | multiple(src_view, dst_view, cpu_extensions); 23 | } 24 | Ok(()) 25 | } 26 | 27 | fn multiply_alpha_inplace( 28 | image_view: &mut impl ImageViewMut, 29 | cpu_extensions: CpuExtensions, 30 | ) -> Result<(), ImageError> { 31 | process_one_images! { 32 | multiply_inplace(image_view, cpu_extensions); 33 | } 34 | Ok(()) 35 | } 36 | 37 | fn divide_alpha( 38 | src_view: &impl ImageView, 39 | dst_view: &mut impl ImageViewMut, 40 | cpu_extensions: CpuExtensions, 41 | ) -> Result<(), ImageError> { 42 | process_two_images! { 43 | divide(src_view, dst_view, cpu_extensions); 44 | } 45 | Ok(()) 46 | } 47 | 48 | fn divide_alpha_inplace( 49 | image_view: &mut impl ImageViewMut, 50 | cpu_extensions: CpuExtensions, 51 | ) -> Result<(), ImageError> { 52 | process_one_images! 
{ 53 | divide_inplace(image_view, cpu_extensions); 54 | } 55 | Ok(()) 56 | } 57 | } 58 | 59 | fn multiple( 60 | src_view: &impl ImageView, 61 | dst_view: &mut impl ImageViewMut, 62 | cpu_extensions: CpuExtensions, 63 | ) { 64 | match cpu_extensions { 65 | #[cfg(target_arch = "x86_64")] 66 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 69 | // #[cfg(target_arch = "aarch64")] 70 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 71 | // #[cfg(target_arch = "wasm32")] 72 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 73 | _ => native::multiply_alpha(src_view, dst_view), 74 | } 75 | } 76 | 77 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 78 | match cpu_extensions { 79 | #[cfg(target_arch = "x86_64")] 80 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 83 | // #[cfg(target_arch = "aarch64")] 84 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 85 | // #[cfg(target_arch = "wasm32")] 86 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 87 | _ => native::multiply_alpha_inplace(image_view), 88 | } 89 | } 90 | 91 | fn divide( 92 | src_view: &impl ImageView, 93 | dst_view: &mut impl ImageViewMut, 94 | cpu_extensions: CpuExtensions, 95 | ) { 96 | match cpu_extensions { 97 | #[cfg(target_arch = "x86_64")] 98 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 101 | // #[cfg(target_arch = "aarch64")] 102 | // CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 103 | // #[cfg(target_arch = "wasm32")] 104 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 105 | _ => native::divide_alpha(src_view, dst_view), 106 | } 107 | } 108 | 109 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 110 | match cpu_extensions { 111 | #[cfg(target_arch = "x86_64")] 112 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 115 | // #[cfg(target_arch = "aarch64")] 116 | // CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 117 | // #[cfg(target_arch = "wasm32")] 118 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 119 | _ => native::divide_alpha_inplace(image_view), 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/alpha/f32x4/native.rs: -------------------------------------------------------------------------------- 1 | use num_traits::Zero; 2 | 3 | use crate::pixels::F32x4; 4 | use crate::utils::foreach_with_pre_reading; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | pub(crate) fn multiply_alpha( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | ) { 11 | let src_rows = src_view.iter_rows(0); 12 | let dst_rows = dst_view.iter_rows_mut(0); 13 | 14 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 15 | multiply_alpha_row(src_row, dst_row); 16 | 
} 17 | } 18 | 19 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 20 | for row in image_view.iter_rows_mut(0) { 21 | multiply_alpha_row_inplace(row); 22 | } 23 | } 24 | 25 | #[inline(always)] 26 | pub(crate) fn multiply_alpha_row(src_row: &[F32x4], dst_row: &mut [F32x4]) { 27 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 28 | let components = src_pixel.0; 29 | let alpha = components[3]; 30 | dst_pixel.0 = [ 31 | components[0] * alpha, 32 | components[1] * alpha, 33 | components[2] * alpha, 34 | alpha, 35 | ]; 36 | } 37 | } 38 | 39 | #[inline(always)] 40 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [F32x4]) { 41 | for pixel in row { 42 | let alpha = pixel.0[3]; 43 | pixel.0[0] *= alpha; 44 | pixel.0[1] *= alpha; 45 | pixel.0[2] *= alpha; 46 | } 47 | } 48 | 49 | // Divide 50 | 51 | #[inline] 52 | pub(crate) fn divide_alpha( 53 | src_view: &impl ImageView, 54 | dst_view: &mut impl ImageViewMut, 55 | ) { 56 | let src_rows = src_view.iter_rows(0); 57 | let dst_rows = dst_view.iter_rows_mut(0); 58 | 59 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 60 | divide_alpha_row(src_row, dst_row); 61 | } 62 | } 63 | 64 | #[inline] 65 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 66 | for row in image_view.iter_rows_mut(0) { 67 | divide_alpha_row_inplace(row); 68 | } 69 | } 70 | 71 | #[inline(always)] 72 | pub(crate) fn divide_alpha_row(src_row: &[F32x4], dst_row: &mut [F32x4]) { 73 | foreach_with_pre_reading( 74 | src_row.iter().zip(dst_row), 75 | |(&src_pixel, dst_pixel)| (src_pixel, dst_pixel), 76 | |(src_pixel, dst_pixel)| { 77 | let components = src_pixel.0; 78 | let alpha = components[3]; 79 | if alpha.is_zero() { 80 | dst_pixel.0 = [0.; 4]; 81 | } else { 82 | let recip_alpha = 1. / alpha; 83 | dst_pixel.0 = [ 84 | components[0] * recip_alpha, 85 | components[1] * recip_alpha, 86 | components[2] * recip_alpha, 87 | alpha, 88 | ]; 89 | } 90 | }, 91 | ); 92 | } 93 | 94 | #[inline(always)] 95 | pub(crate) fn divide_alpha_row_inplace(row: &mut [F32x4]) { 96 | for pixel in row { 97 | let components = pixel.0; 98 | let alpha = components[3]; 99 | if alpha.is_zero() { 100 | pixel.0 = [0.; 4]; 101 | } else { 102 | let recip_alpha = 1. / alpha; 103 | pixel.0 = [ 104 | components[0] * recip_alpha, 105 | components[1] * recip_alpha, 106 | components[2] * recip_alpha, 107 | alpha, 108 | ]; 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/alpha/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{pixels, CpuExtensions, ImageError, ImageView, ImageViewMut}; 2 | 3 | #[macro_use] 4 | mod common; 5 | pub(crate) mod errors; 6 | 7 | mod u8x4; 8 | cfg_if::cfg_if! { 9 | if #[cfg(not(feature = "only_u8x4"))] { 10 | mod u16x2; 11 | mod u16x4; 12 | mod u8x2; 13 | mod f32x2; 14 | mod f32x4; 15 | } 16 | } 17 | 18 | pub(crate) trait AlphaMulDiv: pixels::InnerPixel { 19 | /// Multiplies RGB-channels of source image by alpha-channel and store 20 | /// result into destination image. 21 | #[allow(unused_variables)] 22 | fn multiply_alpha( 23 | src_view: &impl ImageView, 24 | dst_view: &mut impl ImageViewMut, 25 | cpu_extensions: CpuExtensions, 26 | ) -> Result<(), ImageError> { 27 | Err(ImageError::UnsupportedPixelType) 28 | } 29 | 30 | /// Multiplies RGB-channels of image by alpha-channel inplace. 
31 | #[allow(unused_variables)] 32 | fn multiply_alpha_inplace( 33 | image_view: &mut impl ImageViewMut, 34 | cpu_extensions: CpuExtensions, 35 | ) -> Result<(), ImageError> { 36 | Err(ImageError::UnsupportedPixelType) 37 | } 38 | 39 | /// Divides RGB-channels of source image by alpha-channel and store 40 | /// result into destination image. 41 | #[allow(unused_variables)] 42 | fn divide_alpha( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | cpu_extensions: CpuExtensions, 46 | ) -> Result<(), ImageError> { 47 | Err(ImageError::UnsupportedPixelType) 48 | } 49 | 50 | /// Divides RGB-channels of image by alpha-channel inplace. 51 | #[allow(unused_variables)] 52 | fn divide_alpha_inplace( 53 | image_view: &mut impl ImageViewMut, 54 | cpu_extensions: CpuExtensions, 55 | ) -> Result<(), ImageError> { 56 | Err(ImageError::UnsupportedPixelType) 57 | } 58 | } 59 | 60 | impl AlphaMulDiv for pixels::U8 {} 61 | impl AlphaMulDiv for pixels::U8x3 {} 62 | impl AlphaMulDiv for pixels::U16 {} 63 | impl AlphaMulDiv for pixels::U16x3 {} 64 | impl AlphaMulDiv for pixels::I32 {} 65 | impl AlphaMulDiv for pixels::F32 {} 66 | impl AlphaMulDiv for pixels::F32x3 {} 67 | -------------------------------------------------------------------------------- /src/alpha/u16x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::pixels::U16x2; 2 | use crate::{CpuExtensions, ImageError, ImageView, ImageViewMut}; 3 | 4 | use super::AlphaMulDiv; 5 | 6 | #[cfg(target_arch = "x86_64")] 7 | mod avx2; 8 | mod native; 9 | #[cfg(target_arch = "aarch64")] 10 | mod neon; 11 | #[cfg(target_arch = "x86_64")] 12 | mod sse4; 13 | #[cfg(target_arch = "wasm32")] 14 | mod wasm32; 15 | 16 | type P = U16x2; 17 | 18 | impl AlphaMulDiv for P { 19 | fn multiply_alpha( 20 | src_view: &impl ImageView, 21 | dst_view: &mut impl ImageViewMut, 22 | cpu_extensions: CpuExtensions, 23 | ) -> Result<(), ImageError> { 24 | process_two_images! { 25 | multiple(src_view, dst_view, cpu_extensions); 26 | } 27 | Ok(()) 28 | } 29 | 30 | fn multiply_alpha_inplace( 31 | image_view: &mut impl ImageViewMut, 32 | cpu_extensions: CpuExtensions, 33 | ) -> Result<(), ImageError> { 34 | process_one_images! { 35 | multiply_inplace(image_view, cpu_extensions); 36 | } 37 | Ok(()) 38 | } 39 | 40 | fn divide_alpha( 41 | src_view: &impl ImageView, 42 | dst_view: &mut impl ImageViewMut, 43 | cpu_extensions: CpuExtensions, 44 | ) -> Result<(), ImageError> { 45 | process_two_images! { 46 | divide(src_view, dst_view, cpu_extensions); 47 | } 48 | Ok(()) 49 | } 50 | 51 | fn divide_alpha_inplace( 52 | image_view: &mut impl ImageViewMut, 53 | cpu_extensions: CpuExtensions, 54 | ) -> Result<(), ImageError> { 55 | process_one_images! 
{ 56 | divide_inplace(image_view, cpu_extensions); 57 | } 58 | Ok(()) 59 | } 60 | } 61 | 62 | fn multiple( 63 | src_view: &impl ImageView, 64 | dst_view: &mut impl ImageViewMut, 65 | cpu_extensions: CpuExtensions, 66 | ) { 67 | match cpu_extensions { 68 | #[cfg(target_arch = "x86_64")] 69 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 70 | #[cfg(target_arch = "x86_64")] 71 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 72 | #[cfg(target_arch = "aarch64")] 73 | CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 74 | #[cfg(target_arch = "wasm32")] 75 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 76 | _ => native::multiply_alpha(src_view, dst_view), 77 | } 78 | } 79 | 80 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 81 | match cpu_extensions { 82 | #[cfg(target_arch = "x86_64")] 83 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 84 | #[cfg(target_arch = "x86_64")] 85 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 86 | #[cfg(target_arch = "aarch64")] 87 | CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 88 | #[cfg(target_arch = "wasm32")] 89 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 90 | _ => native::multiply_alpha_inplace(image_view), 91 | } 92 | } 93 | 94 | fn divide( 95 | src_view: &impl ImageView, 96 | dst_view: &mut impl ImageViewMut, 97 | cpu_extensions: CpuExtensions, 98 | ) { 99 | match cpu_extensions { 100 | #[cfg(target_arch = "x86_64")] 101 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 102 | #[cfg(target_arch = "x86_64")] 103 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 104 | #[cfg(target_arch = "aarch64")] 105 | CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 106 | #[cfg(target_arch = "wasm32")] 107 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 108 | _ => native::divide_alpha(src_view, dst_view), 109 | } 110 | } 111 | 112 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 113 | match cpu_extensions { 114 | #[cfg(target_arch = "x86_64")] 115 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 116 | #[cfg(target_arch = "x86_64")] 117 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 118 | #[cfg(target_arch = "aarch64")] 119 | CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 120 | #[cfg(target_arch = "wasm32")] 121 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 122 | _ => native::divide_alpha_inplace(image_view), 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/alpha/u16x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip16, mul_div_65535, RECIP_ALPHA16}; 2 | use crate::pixels::U16x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn 
multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u16; 2] = src_pixel.0; 27 | let alpha = components[1]; 28 | dst_pixel.0 = [mul_div_65535(components[0], alpha), alpha]; 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U16x2]) { 34 | for pixel in row { 35 | let components: [u16; 2] = pixel.0; 36 | let alpha = components[1]; 37 | pixel.0 = [mul_div_65535(components[0], alpha), alpha]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for row in image_view.iter_rows_mut(0) { 59 | divide_alpha_row_inplace(row); 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | pub(crate) fn divide_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { 65 | src_row 66 | .iter() 67 | .zip(dst_row) 68 | .for_each(|(src_pixel, dst_pixel)| { 69 | let components: [u16; 2] = src_pixel.0; 70 | let alpha = components[1]; 71 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 72 | dst_pixel.0 = [div_and_clip16(components[0], recip_alpha), alpha]; 73 | }); 74 | } 75 | 76 | #[inline(always)] 77 | pub(crate) fn divide_alpha_row_inplace(row: &mut [U16x2]) { 78 | for pixel in row { 79 | let components: [u16; 2] = pixel.0; 80 | let alpha = components[1]; 81 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 82 | pixel.0 = [div_and_clip16(components[0], recip_alpha), alpha]; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/alpha/u16x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip16, mul_div_65535, RECIP_ALPHA16}; 2 | use crate::pixels::U16x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u16; 4] = src_pixel.0; 27 | let alpha = components[3]; 28 | dst_pixel.0 = [ 29 | mul_div_65535(components[0], alpha), 30 | mul_div_65535(components[1], alpha), 31 | mul_div_65535(components[2], alpha), 32 | alpha, 33 | ]; 34 | } 35 | } 36 | 37 | #[inline(always)] 38 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U16x4]) { 39 | for pixel in row { 40 | let components: [u16; 4] = pixel.0; 41 | let alpha = components[3]; 42 | pixel.0 = [ 43 | 
mul_div_65535(components[0], alpha), 44 | mul_div_65535(components[1], alpha), 45 | mul_div_65535(components[2], alpha), 46 | alpha, 47 | ]; 48 | } 49 | } 50 | 51 | // Divide 52 | 53 | #[inline] 54 | pub(crate) fn divide_alpha( 55 | src_view: &impl ImageView, 56 | dst_view: &mut impl ImageViewMut, 57 | ) { 58 | let src_rows = src_view.iter_rows(0); 59 | let dst_rows = dst_view.iter_rows_mut(0); 60 | 61 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 62 | divide_alpha_row(src_row, dst_row); 63 | } 64 | } 65 | 66 | #[inline] 67 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 68 | for row in image_view.iter_rows_mut(0) { 69 | divide_alpha_row_inplace(row); 70 | } 71 | } 72 | 73 | #[inline(always)] 74 | pub(crate) fn divide_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { 75 | src_row 76 | .iter() 77 | .zip(dst_row) 78 | .for_each(|(src_pixel, dst_pixel)| { 79 | let components: [u16; 4] = src_pixel.0; 80 | let alpha = components[3]; 81 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 82 | dst_pixel.0 = [ 83 | div_and_clip16(components[0], recip_alpha), 84 | div_and_clip16(components[1], recip_alpha), 85 | div_and_clip16(components[2], recip_alpha), 86 | alpha, 87 | ]; 88 | }); 89 | } 90 | 91 | #[inline(always)] 92 | pub(crate) fn divide_alpha_row_inplace(row: &mut [U16x4]) { 93 | row.iter_mut().for_each(|pixel| { 94 | let components: [u16; 4] = pixel.0; 95 | let alpha = components[3]; 96 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 97 | pixel.0 = [ 98 | div_and_clip16(components[0], recip_alpha), 99 | div_and_clip16(components[1], recip_alpha), 100 | div_and_clip16(components[2], recip_alpha), 101 | alpha, 102 | ]; 103 | }); 104 | } 105 | -------------------------------------------------------------------------------- /src/alpha/u8x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip, mul_div_255, RECIP_ALPHA}; 2 | use crate::pixels::U8x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u8; 2] = src_pixel.0; 27 | let alpha = components[1]; 28 | dst_pixel.0 = [mul_div_255(components[0], alpha), alpha]; 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U8x2]) { 34 | for pixel in row { 35 | let components: [u8; 2] = pixel.0; 36 | let alpha = components[1]; 37 | pixel.0 = [mul_div_255(components[0], alpha), alpha]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn 
divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for dst_row in image_view.iter_rows_mut(0) { 59 | let src_row = unsafe { std::slice::from_raw_parts(dst_row.as_ptr(), dst_row.len()) }; 60 | divide_alpha_row(src_row, dst_row); 61 | } 62 | } 63 | 64 | #[inline(always)] 65 | pub(crate) fn divide_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { 66 | src_row 67 | .iter() 68 | .zip(dst_row) 69 | .for_each(|(src_pixel, dst_pixel)| { 70 | let components: [u8; 2] = src_pixel.0; 71 | let alpha = components[1]; 72 | let recip_alpha = RECIP_ALPHA[alpha as usize]; 73 | dst_pixel.0 = [div_and_clip(components[0], recip_alpha), alpha]; 74 | }); 75 | } 76 | -------------------------------------------------------------------------------- /src/alpha/u8x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::AlphaMulDiv; 2 | use crate::pixels::U8x4; 3 | use crate::{CpuExtensions, ImageError, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | mod avx2; 7 | mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | mod wasm32; 14 | 15 | type P = U8x4; 16 | 17 | impl AlphaMulDiv for P { 18 | fn multiply_alpha( 19 | src_view: &impl ImageView, 20 | dst_view: &mut impl ImageViewMut, 21 | cpu_extensions: CpuExtensions, 22 | ) -> Result<(), ImageError> { 23 | process_two_images! { 24 | multiple(src_view, dst_view, cpu_extensions); 25 | } 26 | Ok(()) 27 | } 28 | 29 | fn multiply_alpha_inplace( 30 | image_view: &mut impl ImageViewMut, 31 | cpu_extensions: CpuExtensions, 32 | ) -> Result<(), ImageError> { 33 | process_one_images! { 34 | multiply_inplace(image_view, cpu_extensions); 35 | } 36 | Ok(()) 37 | } 38 | 39 | fn divide_alpha( 40 | src_view: &impl ImageView, 41 | dst_view: &mut impl ImageViewMut, 42 | cpu_extensions: CpuExtensions, 43 | ) -> Result<(), ImageError> { 44 | process_two_images! { 45 | divide(src_view, dst_view, cpu_extensions); 46 | } 47 | Ok(()) 48 | } 49 | 50 | fn divide_alpha_inplace( 51 | image_view: &mut impl ImageViewMut, 52 | cpu_extensions: CpuExtensions, 53 | ) -> Result<(), ImageError> { 54 | process_one_images! 
{ 55 | divide_inplace(image_view, cpu_extensions); 56 | } 57 | Ok(()) 58 | } 59 | } 60 | 61 | fn multiple( 62 | src_view: &impl ImageView, 63 | dst_view: &mut impl ImageViewMut, 64 | cpu_extensions: CpuExtensions, 65 | ) { 66 | match cpu_extensions { 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 69 | #[cfg(target_arch = "x86_64")] 70 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 71 | #[cfg(target_arch = "aarch64")] 72 | CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 73 | #[cfg(target_arch = "wasm32")] 74 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 75 | _ => native::multiply_alpha(src_view, dst_view), 76 | } 77 | } 78 | 79 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 80 | match cpu_extensions { 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 83 | #[cfg(target_arch = "x86_64")] 84 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 85 | #[cfg(target_arch = "aarch64")] 86 | CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 87 | #[cfg(target_arch = "wasm32")] 88 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 89 | _ => native::multiply_alpha_inplace(image_view), 90 | } 91 | } 92 | 93 | fn divide( 94 | src_view: &impl ImageView, 95 | dst_view: &mut impl ImageViewMut, 96 | cpu_extensions: CpuExtensions, 97 | ) { 98 | match cpu_extensions { 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 101 | #[cfg(target_arch = "x86_64")] 102 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 103 | #[cfg(target_arch = "aarch64")] 104 | CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 105 | #[cfg(target_arch = "wasm32")] 106 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 107 | _ => native::divide_alpha(src_view, dst_view), 108 | } 109 | } 110 | 111 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 112 | match cpu_extensions { 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 115 | #[cfg(target_arch = "x86_64")] 116 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 117 | #[cfg(target_arch = "aarch64")] 118 | CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 119 | #[cfg(target_arch = "wasm32")] 120 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 121 | _ => native::divide_alpha_inplace(image_view), 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/alpha/u8x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip, mul_div_255, RECIP_ALPHA}; 2 | use crate::pixels::U8x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | let rows = src_rows.zip(dst_rows); 12 | 13 | for (src_row, dst_row) in rows { 14 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row.iter_mut()) { 15 | 
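// Illustrative sketch (an assumption, not code from this crate): `mul_div_255`, used by
// `multiply_alpha_pixel` below, computes an integer approximation of value * alpha / 255.
// One common exact-rounding formulation looks like this; the crate's actual
// implementation lives in `src/alpha/common.rs` and may differ in detail.
#[allow(dead_code)]
fn mul_div_255_sketch(value: u8, alpha: u8) -> u8 {
    // round(value * alpha / 255) without a division: add 128, then fold the high byte in.
    let tmp = value as u16 * alpha as u16 + 128;
    ((tmp + (tmp >> 8)) >> 8) as u8
}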
*dst_pixel = multiply_alpha_pixel(*src_pixel); 16 | } 17 | } 18 | } 19 | 20 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 21 | let rows = image_view.iter_rows_mut(0); 22 | for row in rows { 23 | multiply_alpha_row_inplace(row); 24 | } 25 | } 26 | 27 | #[inline(always)] 28 | pub(crate) fn multiply_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { 29 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 30 | *dst_pixel = multiply_alpha_pixel(*src_pixel); 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U8x4]) { 36 | for pixel in row.iter_mut() { 37 | *pixel = multiply_alpha_pixel(*pixel); 38 | } 39 | } 40 | 41 | #[inline(always)] 42 | fn multiply_alpha_pixel(mut pixel: U8x4) -> U8x4 { 43 | let alpha = pixel.0[3]; 44 | pixel.0 = [ 45 | mul_div_255(pixel.0[0], alpha), 46 | mul_div_255(pixel.0[1], alpha), 47 | mul_div_255(pixel.0[2], alpha), 48 | alpha, 49 | ]; 50 | pixel 51 | } 52 | 53 | // Divide 54 | 55 | #[inline] 56 | pub(crate) fn divide_alpha( 57 | src_view: &impl ImageView, 58 | dst_view: &mut impl ImageViewMut, 59 | ) { 60 | let src_rows = src_view.iter_rows(0); 61 | let dst_rows = dst_view.iter_rows_mut(0); 62 | let rows = src_rows.zip(dst_rows); 63 | for (src_row, dst_row) in rows { 64 | divide_alpha_row(src_row, dst_row); 65 | } 66 | } 67 | 68 | #[inline] 69 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 70 | let rows = image_view.iter_rows_mut(0); 71 | for row in rows { 72 | row.iter_mut().for_each(|pixel| { 73 | *pixel = divide_alpha_pixel(*pixel); 74 | }); 75 | } 76 | } 77 | 78 | #[inline(always)] 79 | pub(crate) fn divide_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { 80 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 81 | *dst_pixel = divide_alpha_pixel(*src_pixel); 82 | } 83 | } 84 | 85 | #[inline(always)] 86 | fn divide_alpha_pixel(mut pixel: U8x4) -> U8x4 { 87 | let alpha = pixel.0[3]; 88 | let recip_alpha = RECIP_ALPHA[alpha as usize]; 89 | pixel.0 = [ 90 | div_and_clip(pixel.0[0], recip_alpha), 91 | div_and_clip(pixel.0[1], recip_alpha), 92 | div_and_clip(pixel.0[2], recip_alpha), 93 | alpha, 94 | ]; 95 | pixel 96 | } 97 | -------------------------------------------------------------------------------- /src/color/mappers.rs: -------------------------------------------------------------------------------- 1 | use crate::PixelComponentMapper; 2 | 3 | fn gamma_into_linear(input: f32) -> f32 { 4 | input.powf(2.2) 5 | } 6 | 7 | fn linear_into_gamma(input: f32) -> f32 { 8 | input.powf(1.0 / 2.2) 9 | } 10 | 11 | /// Create mapper to convert an image from Gamma 2.2 to linear colorspace and back. 12 | pub fn create_gamma_22_mapper() -> PixelComponentMapper { 13 | PixelComponentMapper::new(gamma_into_linear, linear_into_gamma) 14 | } 15 | 16 | /// https://en.wikipedia.org/wiki/SRGB#From_sRGB_to_CIE_XYZ 17 | /// http://www.ericbrasseur.org/gamma.html?i=2#formulas 18 | fn srgb_to_linear(input: f32) -> f32 { 19 | if input < 0.04045 { 20 | input / 12.92 21 | } else { 22 | const A: f32 = 0.055; 23 | ((input + A) / (1. + A)).powf(2.4) 24 | } 25 | } 26 | 27 | /// https://en.wikipedia.org/wiki/SRGB#From_CIE_XYZ_to_sRGB 28 | /// http://www.ericbrasseur.org/gamma.html?i=2#formulas 29 | fn linear_to_srgb(input: f32) -> f32 { 30 | if input < 0.0031308 { 31 | 12.92 * input 32 | } else { 33 | const A: f32 = 0.055; 34 | (1. + A) * input.powf(1. / 2.4) - A 35 | } 36 | } 37 | 38 | /// Create mapper to convert an image from sRGB to linear RGB colorspace and back. 
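// Worked example (added for illustration, not in the original source): the two branch
// thresholds above are consistent, since 0.0031308 * 12.92 ≈ 0.04045, so the linear and
// power segments meet at the same point in both directions. For a mid-grey sRGB value of
// 0.5, srgb_to_linear(0.5) = ((0.5 + 0.055) / 1.055)^2.4 ≈ 0.214, and feeding that value
// back through linear_to_srgb returns ≈ 0.5.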
39 | pub fn create_srgb_mapper() -> PixelComponentMapper { 40 | PixelComponentMapper::new(srgb_to_linear, linear_to_srgb) 41 | } 42 | -------------------------------------------------------------------------------- /src/convolution/f32x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::vertical_f32::vert_convolution_f32; 3 | use crate::cpu_extensions::CpuExtensions; 4 | use crate::pixels::F32; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | // #[cfg(target_arch = "aarch64")] 11 | // mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | // #[cfg(target_arch = "wasm32")] 15 | // mod wasm32; 16 | 17 | type P = F32; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | let coeffs_ref = &coeffs; 29 | 30 | try_process_in_threads_h! { 31 | horiz_convolution( 32 | src_view, 33 | dst_view, 34 | offset, 35 | coeffs_ref, 36 | cpu_extensions, 37 | ); 38 | } 39 | } 40 | 41 | fn vert_convolution( 42 | src_view: &impl ImageView, 43 | dst_view: &mut impl ImageViewMut, 44 | offset: u32, 45 | coeffs: Coefficients, 46 | cpu_extensions: CpuExtensions, 47 | ) { 48 | debug_assert!(src_view.width() - offset >= dst_view.width()); 49 | 50 | let coeffs_ref = &coeffs; 51 | 52 | try_process_in_threads_v! { 53 | vert_convolution( 54 | src_view, 55 | dst_view, 56 | offset, 57 | coeffs_ref, 58 | cpu_extensions, 59 | ); 60 | } 61 | } 62 | } 63 | 64 | fn horiz_convolution( 65 | src_view: &impl ImageView, 66 | dst_view: &mut impl ImageViewMut, 67 | offset: u32, 68 | coeffs: &Coefficients, 69 | cpu_extensions: CpuExtensions, 70 | ) { 71 | match cpu_extensions { 72 | #[cfg(target_arch = "x86_64")] 73 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 74 | #[cfg(target_arch = "x86_64")] 75 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 76 | // #[cfg(target_arch = "aarch64")] 77 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 78 | // #[cfg(target_arch = "wasm32")] 79 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 80 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | } 82 | } 83 | 84 | fn vert_convolution( 85 | src_view: &impl ImageView, 86 | dst_view: &mut impl ImageViewMut, 87 | offset: u32, 88 | coeffs: &Coefficients, 89 | cpu_extensions: CpuExtensions, 90 | ) { 91 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 92 | } 93 | -------------------------------------------------------------------------------- /src/convolution/f32x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 
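// Illustrative sketch (not part of the original file): each output pixel below is a plain
// 1-D weighted sum, dst[x] = sum_k coeffs_chunk.values[k] * src[coeffs_chunk.start + k],
// accumulated in f64 before narrowing to f32. A minimal scalar version for a single
// output sample, with hypothetical `src`/`weights` slices, could look like this:
#[allow(dead_code)]
fn convolve_one_sample(src: &[f32], start: usize, weights: &[f64]) -> f32 {
    let mut sum = 0.0_f64;
    for (k, &w) in weights.iter().enumerate() {
        // Widen each source sample to f64 to match the precision of the real kernel.
        sum += src[start + k] as f64 * w;
    }
    sum as f32
}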
15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let end_x_src = first_x_src + coeffs_chunk.values.len(); 18 | let mut ss = 0.; 19 | let mut src_pixels = unsafe { src_row.get_unchecked(first_x_src..end_x_src) }; 20 | let mut coefs = coeffs_chunk.values; 21 | 22 | (coefs, src_pixels) = convolution_by_chunks::<8>(coefs, src_pixels, &mut ss); 23 | 24 | for (&k, &pixel) in coefs.iter().zip(src_pixels) { 25 | ss += pixel.0 as f64 * k; 26 | } 27 | dst_pixel.0 = ss as f32; 28 | } 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | fn convolution_by_chunks<'a, 'b, const CHUNK_SIZE: usize>( 34 | coefs: &'a [f64], 35 | src_pixels: &'b [F32], 36 | ss: &mut f64, 37 | ) -> (&'a [f64], &'b [F32]) { 38 | let coef_chunks = coefs.chunks_exact(CHUNK_SIZE); 39 | let coefs = coef_chunks.remainder(); 40 | let pixel_chunks = src_pixels.chunks_exact(CHUNK_SIZE); 41 | let src_pixels = pixel_chunks.remainder(); 42 | for (ks, pixels) in coef_chunks.zip(pixel_chunks) { 43 | for (&k, &pixel) in ks.iter().zip(pixels) { 44 | *ss += pixel.0 as f64 * k; 45 | } 46 | } 47 | (coefs, src_pixels) 48 | } 49 | -------------------------------------------------------------------------------- /src/convolution/f32x1/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32]; ROWS_COUNT], 44 | dst_rows: [&mut [F32]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut sums = [_mm_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let coeffs_by_4 = coeffs.chunks_exact(4); 56 | coeffs = coeffs_by_4.remainder(); 57 | 58 | for k in coeffs_by_4 { 59 | let coeff01_f64x2 = simd_utils::loadu_pd(k, 0); 60 | let coeff23_f64x2 = simd_utils::loadu_pd(k, 2); 61 | 62 | for i in 0..ROWS_COUNT { 
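// Note (added for illustration): `loadu_ps` reads four consecutive f32 samples,
// `_mm_cvtps_pd` widens the low pair to f64, and `_mm_movehl_ps` moves the high pair
// into the low lanes so it can be widened the same way; each row therefore accumulates
// two f64 partial sums per register, which are added together at the store step below.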
63 | let mut sum = sums[i]; 64 | let source = simd_utils::loadu_ps(src_rows[i], x); 65 | 66 | let pixel01_f64 = _mm_cvtps_pd(source); 67 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel01_f64, coeff01_f64x2)); 68 | 69 | let pixel23_f64 = _mm_cvtps_pd(_mm_movehl_ps(source, source)); 70 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel23_f64, coeff23_f64x2)); 71 | 72 | sums[i] = sum; 73 | } 74 | x += 4; 75 | } 76 | 77 | let coeffs_by_2 = coeffs.chunks_exact(2); 78 | coeffs = coeffs_by_2.remainder(); 79 | for k in coeffs_by_2 { 80 | let coeff01_f64x2 = simd_utils::loadu_pd(k, 0); 81 | 82 | for i in 0..ROWS_COUNT { 83 | let pixel0 = src_rows[i].get_unchecked(x).0; 84 | let pixel1 = src_rows[i].get_unchecked(x + 1).0; 85 | let pixel01_f64 = _mm_set_pd(pixel1 as f64, pixel0 as f64); 86 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(pixel01_f64, coeff01_f64x2)); 87 | } 88 | x += 2; 89 | } 90 | 91 | if let Some(&k) = coeffs.first() { 92 | let coeff0_f64x2 = _mm_set1_pd(k); 93 | 94 | for i in 0..ROWS_COUNT { 95 | let pixel0 = src_rows[i].get_unchecked(x).0; 96 | let pixel0_f64 = _mm_set_pd(0., pixel0 as f64); 97 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 98 | } 99 | } 100 | 101 | for i in 0..ROWS_COUNT { 102 | _mm_storeu_pd(ll_buf.as_mut_ptr(), sums[i]); 103 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 104 | dst_pixel.0 = (ll_buf[0] + ll_buf[1]) as f32; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/convolution/f32x2/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x2; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "avx2")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x2]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x2]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut ll_sum = [_mm256_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let 
coeffs_by_4 = coeffs.chunks_exact(4); 56 | coeffs = coeffs_by_4.remainder(); 57 | for k in coeffs_by_4 { 58 | let coeff0_f64x4 = _mm256_set_pd(k[1], k[1], k[0], k[0]); 59 | let coeff1_f64x4 = _mm256_set_pd(k[3], k[3], k[2], k[2]); 60 | 61 | for i in 0..ROWS_COUNT { 62 | let mut sum = ll_sum[i]; 63 | let pixels04_f32x8 = simd_utils::loadu_ps256(src_rows[i], x); 64 | 65 | let pixels01_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(pixels04_f32x8)); 66 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels01_f64x4, coeff0_f64x4)); 67 | 68 | let pixels23_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(pixels04_f32x8)); 69 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels23_f64x4, coeff1_f64x4)); 70 | 71 | ll_sum[i] = sum; 72 | } 73 | x += 4; 74 | } 75 | 76 | let coeffs_by_2 = coeffs.chunks_exact(2); 77 | coeffs = coeffs_by_2.remainder(); 78 | for k in coeffs_by_2 { 79 | let coeff_f64x4 = _mm256_set_pd(k[1], k[1], k[0], k[0]); 80 | 81 | for i in 0..ROWS_COUNT { 82 | let mut sum = ll_sum[i]; 83 | let pixels01_f32x4 = simd_utils::loadu_ps(src_rows[i], x); 84 | 85 | let pixels01_f64x4 = _mm256_cvtps_pd(pixels01_f32x4); 86 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels01_f64x4, coeff_f64x4)); 87 | 88 | ll_sum[i] = sum; 89 | } 90 | x += 2; 91 | } 92 | 93 | if let Some(&k) = coeffs.first() { 94 | let coeff0_f64x4 = _mm256_set1_pd(k); 95 | 96 | for i in 0..ROWS_COUNT { 97 | let mut sum = ll_sum[i]; 98 | let pixel = src_rows[i].get_unchecked(x); 99 | 100 | let pixel0_f64x4 = _mm256_set_pd(0., 0., pixel.0[1] as f64, pixel.0[0] as f64); 101 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 102 | 103 | ll_sum[i] = sum; 104 | } 105 | } 106 | 107 | for i in 0..ROWS_COUNT { 108 | let sum_f64x2 = _mm_add_pd( 109 | _mm256_extractf128_pd::<0>(ll_sum[i]), 110 | _mm256_extractf128_pd::<1>(ll_sum[i]), 111 | ); 112 | _mm_storeu_pd(ll_buf.as_mut_ptr(), sum_f64x2); 113 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 114 | dst_pixel.0 = ll_buf.map(|v| v as f32); 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/convolution/f32x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x2; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x2; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! 
{ 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 2]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x2/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x2; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x2]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x2]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut ll_sum = [_mm_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let coeffs_by_2 = coeffs.chunks_exact(2); 56 | coeffs = coeffs_by_2.remainder(); 57 | 58 | for k in coeffs_by_2 { 59 | let coeff0_f64x2 = _mm_set1_pd(k[0]); 60 | let coeff1_f64x2 = _mm_set1_pd(k[1]); 61 | 62 | for i in 0..ROWS_COUNT { 63 | let mut sum = ll_sum[i]; 64 | let source = simd_utils::loadu_ps(src_rows[i], x); 65 | 66 | let pixel0_f64 = _mm_cvtps_pd(source); 67 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 68 | 69 | let pixel1_f64 = _mm_cvtps_pd(_mm_movehl_ps(source, source)); 70 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel1_f64, coeff1_f64x2)); 71 | 72 | ll_sum[i] = sum; 73 | } 74 | x += 2; 75 | } 76 | 77 | if let Some(&k) = coeffs.first() { 78 | let coeff0_f64x2 = _mm_set1_pd(k); 79 | 80 | for i in 0..ROWS_COUNT { 81 | let mut sum = ll_sum[i]; 82 | let pixel = src_rows[i].get_unchecked(x); 83 | let source = _mm_set_ps(0., 0., pixel.0[1], pixel.0[0]); 84 | 85 | let pixel0_f64 = _mm_cvtps_pd(source); 86 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 87 | 88 | ll_sum[i] = sum; 89 | } 90 | } 91 | 92 | for i in 0..ROWS_COUNT { 93 | _mm_storeu_pd(ll_buf.as_mut_ptr(), ll_sum[i]); 94 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 
95 | dst_pixel.0 = ll_buf.map(|v| v as f32); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/convolution/f32x3/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x3; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x3; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! { 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in 
dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 3]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x4/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x4; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "avx2")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x4]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x4]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 48 | let mut x: usize = coeffs_chunk.start as usize; 49 | let mut rgba_sums = [_mm256_set1_pd(0.); ROWS_COUNT]; 50 | 51 | let mut coeffs = coeffs_chunk.values; 52 | 53 | let coeffs_by_2 = coeffs.chunks_exact(2); 54 | coeffs = coeffs_by_2.remainder(); 55 | for k in coeffs_by_2 { 56 | let coeff0_f64x4 = _mm256_set1_pd(k[0]); 57 | let coeff1_f64x4 = _mm256_set1_pd(k[1]); 58 | 59 | for r in 0..ROWS_COUNT { 60 | let pixel01 = simd_utils::loadu_ps256(src_rows[r], x); 61 | 62 | let pixel0_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(pixel01)); 63 | rgba_sums[r] = 64 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 65 | 66 | let pixels1_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(pixel01)); 67 | rgba_sums[r] = 68 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixels1_f64x4, coeff1_f64x4)); 69 | } 70 | x += 2; 71 | } 72 | 73 | if let Some(&k) = coeffs.first() { 74 | let coeff0_f64x4 = _mm256_set1_pd(k); 75 | 76 | for r in 0..ROWS_COUNT { 77 | let pixel0 = simd_utils::loadu_ps(src_rows[r], x); 78 | 79 | let pixel0_f64x4 = _mm256_cvtps_pd(pixel0); 80 | rgba_sums[r] = 81 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 82 | } 83 | } 
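// Note (added for illustration, not in the original source): each `rgba_sums[r]` register
// holds the four f64 channel accumulators (R, G, B, A) of one output pixel for row `r`;
// the loop below narrows them back to f32 and stores the pixel. A scalar equivalent of
// the accumulation, with hypothetical `src`/`weights` slices, would be roughly:
#[allow(dead_code)]
fn accumulate_rgba_scalar(src: &[[f32; 4]], start: usize, weights: &[f64]) -> [f32; 4] {
    let mut acc = [0.0_f64; 4];
    for (k, &w) in weights.iter().enumerate() {
        for (a, &c) in acc.iter_mut().zip(&src[start + k]) {
            // Accumulate every channel of the source pixel with the same weight.
            *a += c as f64 * w;
        }
    }
    acc.map(|v| v as f32)
}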
84 | 85 | for r in 0..ROWS_COUNT { 86 | let dst_pixel = dst_rows[r].get_unchecked_mut(dst_x); 87 | let rgba_f32x4 = _mm256_cvtpd_ps(rgba_sums[r]); 88 | _mm_storeu_ps(dst_pixel.0.as_mut_ptr(), rgba_f32x4); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/convolution/f32x4/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x4; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x4; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! { 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = 
src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 4]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x4/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x4; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x4]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x4]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut rg_buf = [0f64; 2]; 48 | let mut ba_buf = [0f64; 2]; 49 | 50 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 51 | let mut x: usize = coeffs_chunk.start as usize; 52 | let mut rg_sums = [_mm_set1_pd(0.); ROWS_COUNT]; 53 | let mut ba_sums = [_mm_set1_pd(0.); ROWS_COUNT]; 54 | 55 | for &k in coeffs_chunk.values { 56 | let coeffs_f64x2 = _mm_set1_pd(k); 57 | 58 | for r in 0..ROWS_COUNT { 59 | let pixel = simd_utils::loadu_ps(src_rows[r], x); 60 | let rg_f64x2 = _mm_cvtps_pd(pixel); 61 | rg_sums[r] = _mm_add_pd(rg_sums[r], _mm_mul_pd(rg_f64x2, coeffs_f64x2)); 62 | let ba_f64x2 = _mm_cvtps_pd(_mm_movehl_ps(pixel, pixel)); 63 | ba_sums[r] = _mm_add_pd(ba_sums[r], _mm_mul_pd(ba_f64x2, coeffs_f64x2)); 64 | } 65 | x += 1; 66 | } 67 | 68 | for i in 0..ROWS_COUNT { 69 | _mm_storeu_pd(rg_buf.as_mut_ptr(), rg_sums[i]); 70 | _mm_storeu_pd(ba_buf.as_mut_ptr(), ba_sums[i]); 71 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 72 | dst_pixel.0 = [ 73 | rg_buf[0] as f32, 74 | rg_buf[1] as f32, 75 | ba_buf[0] as f32, 76 | ba_buf[1] as f32, 77 | ]; 78 | } 79 | } 
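// Note (added for illustration): unlike the AVX2 version, an SSE register holds only two
// f64 lanes, so each row keeps two accumulators per pixel: `rg_sums` for the R and G
// channels and `ba_sums` for B and A, which were written out separately just above.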
80 | } 81 | -------------------------------------------------------------------------------- /src/convolution/i32x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::pixels::I32; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | mod native; 6 | 7 | type P = I32; 8 | 9 | impl Convolution for P { 10 | fn horiz_convolution( 11 | src_view: &impl ImageView, 12 | dst_view: &mut impl ImageViewMut, 13 | offset: u32, 14 | coeffs: Coefficients, 15 | _cpu_extensions: CpuExtensions, 16 | ) { 17 | debug_assert!(src_view.height() - offset >= dst_view.height()); 18 | let coeffs_ref = &coeffs; 19 | 20 | try_process_in_threads_h! { 21 | horiz_convolution( 22 | src_view, 23 | dst_view, 24 | offset, 25 | coeffs_ref, 26 | ); 27 | } 28 | } 29 | 30 | fn vert_convolution( 31 | src_view: &impl ImageView, 32 | dst_view: &mut impl ImageViewMut, 33 | offset: u32, 34 | coeffs: Coefficients, 35 | _cpu_extensions: CpuExtensions, 36 | ) { 37 | debug_assert!(src_view.width() - offset >= dst_view.width()); 38 | 39 | let coeffs_ref = &coeffs; 40 | 41 | try_process_in_threads_v! { 42 | vert_convolution( 43 | src_view, 44 | dst_view, 45 | offset, 46 | coeffs_ref, 47 | ); 48 | } 49 | } 50 | } 51 | 52 | #[inline(always)] 53 | fn horiz_convolution( 54 | src_view: &impl ImageView, 55 | dst_view: &mut impl ImageViewMut, 56 | offset: u32, 57 | coefficients: &Coefficients, 58 | ) { 59 | native::horiz_convolution(src_view, dst_view, offset, coefficients); 60 | } 61 | 62 | #[inline(always)] 63 | fn vert_convolution( 64 | src_view: &impl ImageView, 65 | dst_view: &mut impl ImageViewMut, 66 | offset: u32, 67 | coefficients: &Coefficients, 68 | ) { 69 | native::vert_convolution(src_view, dst_view, offset, coefficients); 70 | } 71 | -------------------------------------------------------------------------------- /src/convolution/i32x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::I32; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = 0.; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
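// SAFETY: the coefficient chunks are assumed to be computed so that
// `start + values.len()` never exceeds the source row length, therefore this
// unchecked slice (and the zipped iteration over it below) stays in bounds.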
}; 19 | for (&k, &pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | ss += pixel.0 as f64 * k; 21 | } 22 | dst_pixel.0 = ss.round() as i32; 23 | } 24 | } 25 | } 26 | 27 | pub(crate) fn vert_convolution( 28 | src_view: &impl ImageView, 29 | dst_view: &mut impl ImageViewMut, 30 | offset: u32, 31 | coeffs: &Coefficients, 32 | ) { 33 | let coefficients_chunks = coeffs.get_chunks(); 34 | let dst_rows = dst_view.iter_rows_mut(0); 35 | let start_src_x = offset as usize; 36 | for (&coeffs_chunk, dst_row) in coefficients_chunks.iter().zip(dst_rows) { 37 | let first_y_src = coeffs_chunk.start; 38 | let mut src_x = start_src_x; 39 | for dst_pixel in dst_row.iter_mut() { 40 | let mut ss = 0.; 41 | let src_rows = src_view.iter_rows(first_y_src); 42 | for (src_row, &k) in src_rows.zip(coeffs_chunk.values) { 43 | let src_pixel = unsafe { src_row.get_unchecked(src_x) }; 44 | ss += src_pixel.0 as f64 * k; 45 | } 46 | dst_pixel.0 = ss.round() as i32; 47 | src_x += 1; 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/convolution/macros.rs: -------------------------------------------------------------------------------- 1 | macro_rules! constify_imm8 { 2 | ($imm8:expr, $expand:ident) => { 3 | #[allow(overflowing_literals)] 4 | match ($imm8) & 0b0011_1111 { 5 | 0 => {} 6 | 1 => $expand!(1), 7 | 2 => $expand!(2), 8 | 3 => $expand!(3), 9 | 4 => $expand!(4), 10 | 5 => $expand!(5), 11 | 6 => $expand!(6), 12 | 7 => $expand!(7), 13 | 8 => $expand!(8), 14 | 9 => $expand!(9), 15 | 10 => $expand!(10), 16 | 12 => $expand!(12), 17 | 13 => $expand!(13), 18 | 14 => $expand!(14), 19 | 15 => $expand!(15), 20 | 16 => $expand!(16), 21 | 17 => $expand!(17), 22 | 18 => $expand!(18), 23 | 19 => $expand!(19), 24 | 20 => $expand!(20), 25 | 21 => $expand!(21), 26 | 22 => $expand!(22), 27 | 23 => $expand!(23), 28 | 24 => $expand!(24), 29 | 25 => $expand!(25), 30 | 26 => $expand!(26), 31 | 27 => $expand!(27), 32 | 28 => $expand!(28), 33 | 29 => $expand!(29), 34 | 30 => $expand!(30), 35 | 31 => $expand!(31), 36 | _ => unreachable!(), 37 | } 38 | }; 39 | } 40 | 41 | #[cfg(target_arch = "aarch64")] 42 | macro_rules! 
constify_64_imm8 { 43 | ($imm8:expr, $expand:ident) => { 44 | #[allow(overflowing_literals)] 45 | match ($imm8) & 0b0111_1111 { 46 | 0 => {} 47 | 1 => $expand!(1), 48 | 2 => $expand!(2), 49 | 3 => $expand!(3), 50 | 4 => $expand!(4), 51 | 5 => $expand!(5), 52 | 6 => $expand!(6), 53 | 7 => $expand!(7), 54 | 8 => $expand!(8), 55 | 9 => $expand!(9), 56 | 10 => $expand!(10), 57 | 12 => $expand!(12), 58 | 13 => $expand!(13), 59 | 14 => $expand!(14), 60 | 15 => $expand!(15), 61 | 16 => $expand!(16), 62 | 17 => $expand!(17), 63 | 18 => $expand!(18), 64 | 19 => $expand!(19), 65 | 20 => $expand!(20), 66 | 21 => $expand!(21), 67 | 22 => $expand!(22), 68 | 23 => $expand!(23), 69 | 24 => $expand!(24), 70 | 25 => $expand!(25), 71 | 26 => $expand!(26), 72 | 27 => $expand!(27), 73 | 28 => $expand!(28), 74 | 29 => $expand!(29), 75 | 30 => $expand!(30), 76 | 31 => $expand!(31), 77 | 32 => $expand!(32), 78 | 33 => $expand!(33), 79 | 34 => $expand!(34), 80 | 35 => $expand!(35), 81 | 36 => $expand!(36), 82 | 37 => $expand!(37), 83 | 38 => $expand!(38), 84 | 39 => $expand!(39), 85 | 40 => $expand!(40), 86 | 41 => $expand!(41), 87 | 42 => $expand!(42), 88 | 43 => $expand!(43), 89 | 44 => $expand!(44), 90 | 45 => $expand!(45), 91 | 46 => $expand!(46), 92 | 47 => $expand!(47), 93 | 48 => $expand!(48), 94 | 49 => $expand!(49), 95 | 50 => $expand!(50), 96 | 51 => $expand!(51), 97 | 52 => $expand!(52), 98 | 53 => $expand!(53), 99 | 54 => $expand!(54), 100 | 55 => $expand!(55), 101 | 56 => $expand!(56), 102 | 57 => $expand!(57), 103 | 58 => $expand!(58), 104 | 59 => $expand!(59), 105 | 60 => $expand!(60), 106 | 61 => $expand!(61), 107 | 62 => $expand!(62), 108 | 63 => $expand!(63), 109 | _ => unreachable!(), 110 | } 111 | }; 112 | } 113 | -------------------------------------------------------------------------------- /src/convolution/u16x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
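// NOTE: this macro is defined in src/threading.rs; when the crate is built with
// the optional `rayon` feature it is expected to run the vertical convolution on
// the rayon thread pool, otherwise it should expand to a plain call on the
// current thread.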
{ 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial = 1i64 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = initial; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | ss += src_pixel.0 as i64 * (k as i64); 25 | } 26 | dst_pixel.0 = normalizer.clip(ss); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/convolution/u16x2/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x2; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x2; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial: i64 = 1 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 2]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u16x3/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x3; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x3; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial = 1i64 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 3]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u16x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x4; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x4; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial: i64 = 1 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 4]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
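// The sums above are kept in i64 rather than i32: a u16 component multiplied by
// a 32-bit fixed-point coefficient from `Normalizer32` can exceed the i32 range,
// and several such products are added per output pixel. `normalizer.clip(ss)`
// then turns the wide sum back into a u16, presumably by dividing out the
// fixed-point precision and clamping.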
}; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u8x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let initial = 1 << (precision - 1); 14 | let coefficients = normalizer.chunks(); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = initial; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, &src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | ss += src_pixel.0 as i32 * (k as i32); 25 | } 26 | dst_pixel.0 = unsafe { normalizer.clip(ss) }; 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/convolution/u8x2/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x2; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x2; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | normalizer: &Normalizer16, 10 | ) { 11 | let precision = normalizer.precision(); 12 | let coefficients_chunks = normalizer.chunks(); 13 | let initial = 1 << (precision - 1); 14 | 15 | let src_rows = src_view.iter_rows(offset); 16 | let dst_rows = dst_view.iter_rows_mut(0); 17 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 18 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 19 | let first_x_src = coeffs_chunk.start as usize; 20 | let ks = coeffs_chunk.values(); 21 | let mut ss = [initial; 2]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
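// NOTE on the fixed-point scheme used here: `Normalizer16` appears to store the
// f64 convolution coefficients as i16 values with `precision` fractional bits,
// so each product fits in an i32. `initial = 1 << (precision - 1)` is the
// rounding half that `normalizer.clip` accounts for when it scales the sum back
// down and clamps it to the 0..=255 range of a u8 component.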
}; 23 | for (&k, &src_pixel) in ks.iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/u8x3/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x3; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x3; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients = normalizer.chunks(); 14 | let initial = 1i32 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 3]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/u8x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x4; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x4; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::image_view::{ImageView, ImageViewMut}; 3 | use crate::pixels::U8x4; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let initial = 1 << (precision - 1); 14 | let coefficients = normalizer.chunks(); 15 | let src_rows = src_view.iter_rows(offset); 16 | let dst_rows = dst_view.iter_rows_mut(0); 17 | 18 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 4]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 23 | for (&k, &src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::InnerPixel; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | use super::native; 8 | 9 | pub(crate) fn vert_convolution( 10 | src_view: &impl ImageView, 11 | dst_view: &mut impl ImageViewMut, 12 | offset: u32, 13 | coeffs: &Coefficients, 14 | ) where 15 | T: InnerPixel, 16 | { 17 | let coefficients_chunks = coeffs.get_chunks(); 18 | let src_x = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { 22 | unsafe { 23 | vert_convolution_into_one_row_f32(src_view, dst_row, src_x, coeffs_chunk); 24 | } 25 | } 26 | } 27 | 28 | #[target_feature(enable = "avx2")] 29 | unsafe fn vert_convolution_into_one_row_f32>( 30 | src_view: &impl ImageView, 31 | dst_row: &mut [T], 32 | mut src_x: usize, 33 | coeffs_chunk: CoefficientsChunk, 34 | ) { 35 | let mut c_buf = [0f64; 4]; 36 | let mut dst_f32 = T::components_mut(dst_row); 37 | 38 | let mut dst_chunks = dst_f32.chunks_exact_mut(32); 39 | for dst_chunk in &mut dst_chunks { 40 | multiply_components_of_rows::<_, 8>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 41 | src_x += 32; 42 | } 43 | 44 | dst_f32 = dst_chunks.into_remainder(); 45 | dst_chunks = dst_f32.chunks_exact_mut(16); 46 | for dst_chunk in &mut dst_chunks { 47 | multiply_components_of_rows::<_, 4>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 48 | src_x += 16; 49 | } 50 | 51 | dst_f32 = dst_chunks.into_remainder(); 52 | dst_chunks = dst_f32.chunks_exact_mut(8); 53 | for dst_chunk in &mut dst_chunks { 54 | multiply_components_of_rows::<_, 2>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 55 | src_x += 8; 56 | } 57 | 58 | dst_f32 = dst_chunks.into_remainder(); 59 | if !dst_f32.is_empty() { 60 | let y_start = coeffs_chunk.start; 61 | let coeffs = coeffs_chunk.values; 62 | native::convolution_by_f32(src_view, dst_f32, src_x, y_start, coeffs); 63 | } 64 | } 65 | 66 | #[inline] 67 | #[target_feature(enable = "avx2")] 68 | unsafe fn multiply_components_of_rows, const SUMS_COUNT: usize>( 69 | src_view: &impl ImageView, 70 | src_x: usize, 71 | coeffs_chunk: CoefficientsChunk, 72 | dst_chunk: &mut [f32], 73 | c_buf: &mut [f64; 4], 74 | ) { 75 | let mut sums = [_mm256_set1_pd(0.); SUMS_COUNT]; 76 | let y_start = coeffs_chunk.start; 77 | let mut coeffs = coeffs_chunk.values; 78 | let mut y: u32 = 0; 79 | let max_rows = coeffs.len() as u32; 80 | 81 | let coeffs_2 = coeffs.chunks_exact(2); 82 | coeffs = coeffs_2.remainder(); 83 | for (src_rows, two_coeffs) in src_view.iter_2_rows(y_start, max_rows).zip(coeffs_2) { 84 | let src_rows = src_rows.map(|row| T::components(row).get_unchecked(src_x..)); 85 | for (&coeff, src_row) in two_coeffs.iter().zip(src_rows) { 86 | multiply_components_of_row(&mut sums, coeff, src_row); 87 | } 88 | y += 2; 89 | } 90 | 91 | if let Some(&coeff) = coeffs.first() { 92 | if let Some(s_row) = src_view.iter_rows(y_start + y).next() { 93 | let src_row 
= T::components(s_row).get_unchecked(src_x..); 94 | multiply_components_of_row(&mut sums, coeff, src_row); 95 | } 96 | } 97 | 98 | let mut dst_ptr = dst_chunk.as_mut_ptr(); 99 | for sum in sums { 100 | _mm256_storeu_pd(c_buf.as_mut_ptr(), sum); 101 | for &v in c_buf.iter() { 102 | *dst_ptr = v as f32; 103 | dst_ptr = dst_ptr.add(1); 104 | } 105 | } 106 | } 107 | 108 | #[inline] 109 | #[target_feature(enable = "avx2")] 110 | unsafe fn multiply_components_of_row( 111 | sums: &mut [__m256d; SUMS_COUNT], 112 | coeff: f64, 113 | src_row: &[f32], 114 | ) { 115 | let coeff_f64x4 = _mm256_set1_pd(coeff); 116 | let mut i = 0; 117 | while i < SUMS_COUNT { 118 | let comp07_f32x8 = simd_utils::loadu_ps256(src_row, i * 4); 119 | 120 | let comp03_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(comp07_f32x8)); 121 | sums[i] = _mm256_add_pd(sums[i], _mm256_mul_pd(comp03_f64x4, coeff_f64x4)); 122 | i += 1; 123 | 124 | let comp47_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(comp07_f32x8)); 125 | sums[i] = _mm256_add_pd(sums[i], _mm256_mul_pd(comp47_f64x4, coeff_f64x4)); 126 | i += 1; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | // #[cfg(target_arch = "aarch64")] 9 | // mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | // #[cfg(target_arch = "wasm32")] 13 | // pub mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_f32>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | coeffs: &Coefficients, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(coeffs.bounds.len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, coeffs), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, coeffs), 31 | // #[cfg(target_arch = "aarch64")] 32 | // CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, coeffs), 33 | // #[cfg(target_arch = "wasm32")] 34 | // CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, coeffs), 35 | _ => native::vert_convolution(src_view, dst_view, offset, coeffs), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::InnerPixel; 3 | use crate::utils::foreach_with_pre_reading; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | #[inline(always)] 7 | pub(crate) fn vert_convolution( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | offset: u32, 11 | coeffs: &Coefficients, 12 | ) where 13 | T: InnerPixel, 14 | { 15 | let coefficients_chunks = coeffs.get_chunks(); 16 | let src_x_initial = offset as usize * T::count_of_components(); 17 | 18 | let dst_rows = dst_view.iter_rows_mut(0); 19 | let coeffs_chunks_iter = 
coefficients_chunks.into_iter(); 20 | for (coeffs_chunk, dst_row) in coeffs_chunks_iter.zip(dst_rows) { 21 | let first_y_src = coeffs_chunk.start; 22 | let ks = coeffs_chunk.values; 23 | let mut dst_components = T::components_mut(dst_row); 24 | let mut x_src = src_x_initial; 25 | 26 | #[cfg(target_arch = "aarch64")] 27 | { 28 | (dst_components, x_src) = 29 | convolution_by_chunks::<_, 16>(src_view, dst_components, x_src, first_y_src, ks); 30 | } 31 | 32 | #[cfg(not(target_arch = "wasm32"))] 33 | { 34 | if !dst_components.is_empty() { 35 | (dst_components, x_src) = 36 | convolution_by_chunks::<_, 8>(src_view, dst_components, x_src, first_y_src, ks); 37 | } 38 | } 39 | 40 | #[cfg(target_arch = "wasm32")] 41 | { 42 | if !dst_components.is_empty() { 43 | (dst_components, x_src) = 44 | crate::convolution::vertical_f32::native::convolution_by_chunks::<_, 4>( 45 | src_view, 46 | dst_components, 47 | x_src, 48 | first_y_src, 49 | ks, 50 | ); 51 | } 52 | } 53 | 54 | if !dst_components.is_empty() { 55 | convolution_by_f32(src_view, dst_components, x_src, first_y_src, ks); 56 | } 57 | } 58 | } 59 | 60 | #[inline(always)] 61 | pub(crate) fn convolution_by_f32>( 62 | src_view: &impl ImageView, 63 | dst_components: &mut [f32], 64 | mut x_src: usize, 65 | first_y_src: u32, 66 | ks: &[f64], 67 | ) -> usize { 68 | for dst_component in dst_components.iter_mut() { 69 | let mut ss = 0.; 70 | let src_rows = src_view.iter_rows(first_y_src); 71 | for (&k, src_row) in ks.iter().zip(src_rows) { 72 | // SAFETY: Alignment of src_row is greater or equal than alignment f32 73 | // because a component of pixel type T is f32. 74 | let src_ptr = src_row.as_ptr() as *const f32; 75 | let src_component = unsafe { *src_ptr.add(x_src) }; 76 | ss += src_component as f64 * k; 77 | } 78 | *dst_component = ss as f32; 79 | x_src += 1 80 | } 81 | x_src 82 | } 83 | 84 | #[inline(always)] 85 | fn convolution_by_chunks<'a, T, const CHUNK_SIZE: usize>( 86 | src_view: &impl ImageView, 87 | dst_components: &'a mut [f32], 88 | mut x_src: usize, 89 | first_y_src: u32, 90 | ks: &[f64], 91 | ) -> (&'a mut [f32], usize) 92 | where 93 | T: InnerPixel, 94 | { 95 | let mut dst_chunks = dst_components.chunks_exact_mut(CHUNK_SIZE); 96 | 97 | for dst_chunk in &mut dst_chunks { 98 | let mut ss = [0.; CHUNK_SIZE]; 99 | let src_rows = src_view.iter_rows(first_y_src); 100 | 101 | foreach_with_pre_reading( 102 | ks.iter().zip(src_rows), 103 | |(&k, src_row)| { 104 | let src_ptr = src_row.as_ptr() as *const f32; 105 | let src_chunk = unsafe { 106 | let ptr = src_ptr.add(x_src) as *const [f32; CHUNK_SIZE]; 107 | ptr.read_unaligned() 108 | }; 109 | (src_chunk, k) 110 | }, 111 | |(src_chunk, k)| { 112 | for (s, c) in ss.iter_mut().zip(src_chunk) { 113 | *s += c as f64 * k; 114 | } 115 | }, 116 | ); 117 | 118 | for (i, s) in ss.iter().copied().enumerate() { 119 | dst_chunk[i] = s as f32; 120 | } 121 | x_src += CHUNK_SIZE; 122 | } 123 | 124 | (dst_chunks.into_remainder(), x_src) 125 | } 126 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::InnerPixel; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | use super::native; 8 | 9 | pub(crate) fn vert_convolution( 10 | src_view: &impl ImageView, 11 | dst_view: &mut impl ImageViewMut, 12 | offset: u32, 13 | coeffs: &Coefficients, 14 | ) 
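// Each destination row below is produced from one coefficients chunk. The
// per-row routine walks the row's f32 components in chunks of 16, then 8,
// then a single chunk of 4, accumulating in f64 pairs (__m128d), and leaves
// any remaining components to the scalar native::convolution_by_f32 fallback.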
where 15 | T: InnerPixel, 16 | { 17 | let coefficients_chunks = coeffs.get_chunks(); 18 | let src_x = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { 22 | unsafe { 23 | vert_convolution_into_one_row_f32(src_view, dst_row, src_x, coeffs_chunk); 24 | } 25 | } 26 | } 27 | 28 | #[target_feature(enable = "sse4.1")] 29 | unsafe fn vert_convolution_into_one_row_f32>( 30 | src_view: &impl ImageView, 31 | dst_row: &mut [T], 32 | mut src_x: usize, 33 | coeffs_chunk: CoefficientsChunk, 34 | ) { 35 | let mut c_buf = [0f64; 2]; 36 | let mut dst_f32 = T::components_mut(dst_row); 37 | 38 | let mut dst_chunks = dst_f32.chunks_exact_mut(16); 39 | for dst_chunk in &mut dst_chunks { 40 | multiply_components_of_rows::<_, 8>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 41 | src_x += 16; 42 | } 43 | 44 | dst_f32 = dst_chunks.into_remainder(); 45 | dst_chunks = dst_f32.chunks_exact_mut(8); 46 | for dst_chunk in &mut dst_chunks { 47 | multiply_components_of_rows::<_, 4>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 48 | src_x += 8; 49 | } 50 | 51 | dst_f32 = dst_chunks.into_remainder(); 52 | dst_chunks = dst_f32.chunks_exact_mut(4); 53 | if let Some(dst_chunk) = dst_chunks.next() { 54 | multiply_components_of_rows::<_, 2>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 55 | src_x += 4; 56 | } 57 | 58 | dst_f32 = dst_chunks.into_remainder(); 59 | if !dst_f32.is_empty() { 60 | let y_start = coeffs_chunk.start; 61 | let coeffs = coeffs_chunk.values; 62 | native::convolution_by_f32(src_view, dst_f32, src_x, y_start, coeffs); 63 | } 64 | } 65 | 66 | #[inline] 67 | #[target_feature(enable = "sse4.1")] 68 | pub(crate) unsafe fn multiply_components_of_rows< 69 | T: InnerPixel, 70 | const SUMS_COUNT: usize, 71 | >( 72 | src_view: &impl ImageView, 73 | src_x: usize, 74 | coeffs_chunk: CoefficientsChunk, 75 | dst_chunk: &mut [f32], 76 | c_buf: &mut [f64; 2], 77 | ) { 78 | let mut sums = [_mm_set1_pd(0.); SUMS_COUNT]; 79 | let y_start = coeffs_chunk.start; 80 | let mut coeffs = coeffs_chunk.values; 81 | let mut y: u32 = 0; 82 | let max_rows = coeffs.len() as u32; 83 | 84 | let coeffs_2 = coeffs.chunks_exact(2); 85 | coeffs = coeffs_2.remainder(); 86 | for (src_rows, two_coeffs) in src_view.iter_2_rows(y_start, max_rows).zip(coeffs_2) { 87 | let src_rows = src_rows.map(|row| T::components(row).get_unchecked(src_x..)); 88 | for (&coeff, src_row) in two_coeffs.iter().zip(src_rows) { 89 | multiply_components_of_row(&mut sums, coeff, src_row); 90 | } 91 | y += 2; 92 | } 93 | 94 | if let Some(&coeff) = coeffs.first() { 95 | if let Some(s_row) = src_view.iter_rows(y_start + y).next() { 96 | let src_row = T::components(s_row).get_unchecked(src_x..); 97 | multiply_components_of_row(&mut sums, coeff, src_row); 98 | } 99 | } 100 | 101 | let mut dst_ptr = dst_chunk.as_mut_ptr(); 102 | for sum in sums { 103 | _mm_storeu_pd(c_buf.as_mut_ptr(), sum); 104 | for &v in c_buf.iter() { 105 | *dst_ptr = v as f32; 106 | dst_ptr = dst_ptr.add(1); 107 | } 108 | } 109 | } 110 | 111 | #[inline] 112 | #[target_feature(enable = "sse4.1")] 113 | unsafe fn multiply_components_of_row( 114 | sums: &mut [__m128d; SUMS_COUNT], 115 | coeff: f64, 116 | src_row: &[f32], 117 | ) { 118 | let coeff_f64x2 = _mm_set1_pd(coeff); 119 | let mut i = 0; 120 | while i < SUMS_COUNT { 121 | let comp03_f32x4 = simd_utils::loadu_ps(src_row, i * 2); 122 | 123 | let comp01_f64x2 = _mm_cvtps_pd(comp03_f32x4); 124 | sums[i] = 
_mm_add_pd(sums[i], _mm_mul_pd(comp01_f64x2, coeff_f64x2)); 125 | i += 1; 126 | 127 | let comp23_f64x2 = _mm_cvtps_pd(_mm_movehl_ps(comp03_f32x4, comp03_f32x4)); 128 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(comp23_f64x2, coeff_f64x2)); 129 | i += 1; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/convolution/vertical_u16/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | pub mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_u16>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | normalizer: &Normalizer32, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(normalizer.chunks_len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, normalizer), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, normalizer), 31 | #[cfg(target_arch = "aarch64")] 32 | CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, normalizer), 33 | #[cfg(target_arch = "wasm32")] 34 | CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, normalizer), 35 | _ => native::vert_convolution(src_view, dst_view, offset, normalizer), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/convolution/vertical_u16/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::InnerPixel; 3 | use crate::utils::foreach_with_pre_reading; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | #[inline(always)] 7 | pub(crate) fn vert_convolution( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | offset: u32, 11 | normalizer: &Normalizer32, 12 | ) where 13 | T: InnerPixel, 14 | { 15 | let coefficients_chunks = normalizer.chunks(); 16 | let precision = normalizer.precision(); 17 | let initial: i64 = 1 << (precision - 1); 18 | let src_x_initial = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | let coeffs_chunks_iter = coefficients_chunks.iter(); 22 | for (coeffs_chunk, dst_row) in coeffs_chunks_iter.zip(dst_rows) { 23 | let first_y_src = coeffs_chunk.start; 24 | let ks = coeffs_chunk.values(); 25 | let dst_components = T::components_mut(dst_row); 26 | let mut x_src = src_x_initial; 27 | 28 | let (_, dst_chunks, tail) = unsafe { dst_components.align_to_mut::<[u16; 16]>() }; 29 | x_src = convolution_by_chunks( 30 | src_view, 31 | normalizer, 32 | initial, 33 | dst_chunks, 34 | x_src, 35 | first_y_src, 36 | ks, 37 | ); 38 | 39 | if !tail.is_empty() { 40 | convolution_by_u16(src_view, normalizer, initial, tail, x_src, first_y_src, ks); 41 | } 42 | } 43 | } 44 | 45 | #[inline(always)] 46 | pub(crate) fn convolution_by_u16>( 47 | src_view: &impl 
ImageView, 48 | normalizer: &Normalizer32, 49 | initial: i64, 50 | dst_components: &mut [u16], 51 | mut x_src: usize, 52 | first_y_src: u32, 53 | ks: &[i32], 54 | ) -> usize { 55 | for dst_component in dst_components.iter_mut() { 56 | let mut ss = initial; 57 | let src_rows = src_view.iter_rows(first_y_src); 58 | for (&k, src_row) in ks.iter().zip(src_rows) { 59 | // SAFETY: Alignment of src_row is greater or equal than alignment u16 60 | // because one component of pixel type T is u16. 61 | let src_ptr = src_row.as_ptr() as *const u16; 62 | let src_component = unsafe { *src_ptr.add(x_src) }; 63 | ss += src_component as i64 * (k as i64); 64 | } 65 | *dst_component = normalizer.clip(ss); 66 | x_src += 1 67 | } 68 | x_src 69 | } 70 | 71 | #[inline(always)] 72 | fn convolution_by_chunks( 73 | src_view: &impl ImageView, 74 | normalizer: &Normalizer32, 75 | initial: i64, 76 | dst_chunks: &mut [[u16; CHUNK_SIZE]], 77 | mut x_src: usize, 78 | first_y_src: u32, 79 | ks: &[i32], 80 | ) -> usize 81 | where 82 | T: InnerPixel, 83 | { 84 | for dst_chunk in dst_chunks { 85 | let mut ss = [initial; CHUNK_SIZE]; 86 | let src_rows = src_view.iter_rows(first_y_src); 87 | 88 | foreach_with_pre_reading( 89 | ks.iter().zip(src_rows), 90 | |(&k, src_row)| { 91 | let src_ptr = src_row.as_ptr() as *const u16; 92 | let src_chunk = unsafe { 93 | let ptr = src_ptr.add(x_src) as *const [u16; CHUNK_SIZE]; 94 | ptr.read_unaligned() 95 | }; 96 | (src_chunk, k) 97 | }, 98 | |(src_chunk, k)| { 99 | for (s, c) in ss.iter_mut().zip(src_chunk) { 100 | *s += c as i64 * (k as i64); 101 | } 102 | }, 103 | ); 104 | 105 | for (i, s) in ss.iter().copied().enumerate() { 106 | dst_chunk[i] = normalizer.clip(s); 107 | } 108 | x_src += CHUNK_SIZE; 109 | } 110 | x_src 111 | } 112 | -------------------------------------------------------------------------------- /src/convolution/vertical_u8/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | pub(crate) mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_u8>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | normalizer: &Normalizer16, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(normalizer.chunks_len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, normalizer), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, normalizer), 31 | #[cfg(target_arch = "aarch64")] 32 | CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, normalizer), 33 | #[cfg(target_arch = "wasm32")] 34 | CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, normalizer), 35 | _ => native::vert_convolution(src_view, dst_view, offset, normalizer), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/cpu_extensions.rs: 
-------------------------------------------------------------------------------- 1 | /// SIMD extension of CPU. 2 | /// Specific variants depend on target architecture. 3 | /// Look at source code to see all available variants. 4 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 5 | pub enum CpuExtensions { 6 | None, 7 | #[cfg(target_arch = "x86_64")] 8 | /// SIMD extension of x86_64 architecture 9 | Sse4_1, 10 | #[cfg(target_arch = "x86_64")] 11 | /// SIMD extension of x86_64 architecture 12 | Avx2, 13 | #[cfg(target_arch = "aarch64")] 14 | /// SIMD extension of Arm64 architecture 15 | Neon, 16 | #[cfg(target_arch = "wasm32")] 17 | /// SIMD extension of Wasm32 architecture 18 | Simd128, 19 | } 20 | 21 | impl CpuExtensions { 22 | /// Returns `true` if your CPU support the extension. 23 | pub fn is_supported(&self) -> bool { 24 | match self { 25 | #[cfg(target_arch = "x86_64")] 26 | Self::Avx2 => is_x86_feature_detected!("avx2"), 27 | #[cfg(target_arch = "x86_64")] 28 | Self::Sse4_1 => is_x86_feature_detected!("sse4.1"), 29 | #[cfg(target_arch = "aarch64")] 30 | Self::Neon => std::arch::is_aarch64_feature_detected!("neon"), 31 | #[cfg(target_arch = "wasm32")] 32 | Self::Simd128 => true, 33 | Self::None => true, 34 | } 35 | } 36 | } 37 | 38 | impl Default for CpuExtensions { 39 | #[cfg(target_arch = "x86_64")] 40 | fn default() -> Self { 41 | if is_x86_feature_detected!("avx2") { 42 | Self::Avx2 43 | } else if is_x86_feature_detected!("sse4.1") { 44 | Self::Sse4_1 45 | } else { 46 | Self::None 47 | } 48 | } 49 | 50 | #[cfg(target_arch = "aarch64")] 51 | fn default() -> Self { 52 | if std::arch::is_aarch64_feature_detected!("neon") { 53 | Self::Neon 54 | } else { 55 | Self::None 56 | } 57 | } 58 | #[cfg(target_arch = "wasm32")] 59 | fn default() -> Self { 60 | Self::Simd128 61 | } 62 | 63 | #[cfg(not(any( 64 | target_arch = "x86_64", 65 | target_arch = "aarch64", 66 | target_arch = "wasm32" 67 | )))] 68 | fn default() -> Self { 69 | Self::None 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 4 | #[non_exhaustive] 5 | pub enum ImageError { 6 | #[error("Pixel type of image is not supported")] 7 | UnsupportedPixelType, 8 | } 9 | 10 | #[derive(Error, Debug, Clone, Copy)] 11 | #[error("Size of container with pixels is smaller than required")] 12 | pub struct InvalidPixelsSize; 13 | 14 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 15 | pub enum ImageBufferError { 16 | #[error("Size of buffer is smaller than required")] 17 | InvalidBufferSize, 18 | #[error("Alignment of buffer don't match to alignment of required pixel type")] 19 | InvalidBufferAlignment, 20 | } 21 | 22 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 23 | pub enum CropBoxError { 24 | #[error("Position of the crop box is out of the image boundaries")] 25 | PositionIsOutOfImageBoundaries, 26 | #[error("Size of the crop box is out of the image boundaries")] 27 | SizeIsOutOfImageBoundaries, 28 | #[error("Width or height of the crop box is less than zero")] 29 | WidthOrHeightLessThanZero, 30 | } 31 | 32 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 33 | #[non_exhaustive] 34 | pub enum ResizeError { 35 | #[error("Source or destination image is not supported")] 36 | ImageError(#[from] ImageError), 37 | #[error("Pixel type of source image does not match to destination image")] 38 | 
PixelTypesAreDifferent, 39 | #[error("Source cropping option is invalid: {0}")] 40 | SrcCroppingError(#[from] CropBoxError), 41 | } 42 | 43 | #[derive(Error, Debug, Clone, Copy)] 44 | #[error( 45 | "The dimensions of the source image are not equal to the dimensions of the destination image" 46 | )] 47 | pub struct DifferentDimensionsError; 48 | 49 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 50 | pub enum MappingError { 51 | #[error("Source or destination image is not supported")] 52 | ImageError(#[from] ImageError), 53 | #[error("The dimensions of the source image are not equal to the dimensions of the destination image")] 54 | DifferentDimensions, 55 | #[error("Unsupported combination of pixels of source and/or destination images")] 56 | UnsupportedCombinationOfImageTypes, 57 | } 58 | 59 | impl From for MappingError { 60 | fn from(_: DifferentDimensionsError) -> Self { 61 | MappingError::DifferentDimensions 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/images/cropped_image.rs: -------------------------------------------------------------------------------- 1 | use crate::images::{check_crop_box, TypedCroppedImage, TypedCroppedImageMut}; 2 | use crate::{ 3 | CropBoxError, ImageView, ImageViewMut, IntoImageView, IntoImageViewMut, PixelTrait, PixelType, 4 | }; 5 | 6 | /// It is a wrapper that provides [IntoImageView] for part of wrapped image. 7 | pub struct CroppedImage<'a, V: IntoImageView> { 8 | image: &'a V, 9 | left: u32, 10 | top: u32, 11 | width: u32, 12 | height: u32, 13 | } 14 | 15 | /// It is a wrapper that provides [IntoImageView] and [IntoImageViewMut] for part of wrapped image. 16 | pub struct CroppedImageMut<'a, V: IntoImageView> { 17 | image: &'a mut V, 18 | left: u32, 19 | top: u32, 20 | width: u32, 21 | height: u32, 22 | } 23 | 24 | impl<'a, V: IntoImageView> CroppedImage<'a, V> { 25 | pub fn new( 26 | image: &'a V, 27 | left: u32, 28 | top: u32, 29 | width: u32, 30 | height: u32, 31 | ) -> Result { 32 | check_crop_box(image.width(), image.height(), left, top, width, height)?; 33 | Ok(Self { 34 | image, 35 | left, 36 | top, 37 | width, 38 | height, 39 | }) 40 | } 41 | } 42 | 43 | impl<'a, V: IntoImageView> CroppedImageMut<'a, V> { 44 | pub fn new( 45 | image: &'a mut V, 46 | left: u32, 47 | top: u32, 48 | width: u32, 49 | height: u32, 50 | ) -> Result { 51 | check_crop_box(image.width(), image.height(), left, top, width, height)?; 52 | Ok(Self { 53 | image, 54 | left, 55 | top, 56 | width, 57 | height, 58 | }) 59 | } 60 | } 61 | 62 | impl<'a, V: IntoImageView> IntoImageView for CroppedImage<'a, V> { 63 | fn pixel_type(&self) -> Option { 64 | self.image.pixel_type() 65 | } 66 | 67 | fn width(&self) -> u32 { 68 | self.width 69 | } 70 | 71 | fn height(&self) -> u32 { 72 | self.height 73 | } 74 | 75 | fn image_view(&self) -> Option> { 76 | self.image.image_view().map(|v| { 77 | TypedCroppedImage::new(v, self.left, self.top, self.width, self.height).unwrap() 78 | }) 79 | } 80 | } 81 | 82 | impl<'a, V: IntoImageView> IntoImageView for CroppedImageMut<'a, V> { 83 | fn pixel_type(&self) -> Option { 84 | self.image.pixel_type() 85 | } 86 | 87 | fn width(&self) -> u32 { 88 | self.width 89 | } 90 | 91 | fn height(&self) -> u32 { 92 | self.height 93 | } 94 | 95 | fn image_view(&self) -> Option> { 96 | self.image.image_view().map(|v| { 97 | TypedCroppedImage::new(v, self.left, self.top, self.width, self.height).unwrap() 98 | }) 99 | } 100 | } 101 | 102 | impl<'a, V: IntoImageViewMut> IntoImageViewMut for 
CroppedImageMut<'a, V> { 103 | fn image_view_mut(&mut self) -> Option> { 104 | self.image.image_view_mut().map(|v| { 105 | TypedCroppedImageMut::new(v, self.left, self.top, self.width, self.height).unwrap() 106 | }) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/images/image_crate.rs: -------------------------------------------------------------------------------- 1 | use std::ops::DerefMut; 2 | 3 | use crate::image_view::try_pixel_type; 4 | use crate::images::{TypedImage, TypedImageRef}; 5 | use crate::{ImageView, ImageViewMut, IntoImageView, IntoImageViewMut, PixelTrait, PixelType}; 6 | use bytemuck::cast_slice_mut; 7 | use image::DynamicImage; 8 | 9 | impl IntoImageView for DynamicImage { 10 | fn pixel_type(&self) -> Option { 11 | match self { 12 | DynamicImage::ImageLuma8(_) => Some(PixelType::U8), 13 | DynamicImage::ImageLumaA8(_) => Some(PixelType::U8x2), 14 | DynamicImage::ImageRgb8(_) => Some(PixelType::U8x3), 15 | DynamicImage::ImageRgba8(_) => Some(PixelType::U8x4), 16 | DynamicImage::ImageLuma16(_) => Some(PixelType::U16), 17 | DynamicImage::ImageLumaA16(_) => Some(PixelType::U16x2), 18 | DynamicImage::ImageRgb16(_) => Some(PixelType::U16x3), 19 | DynamicImage::ImageRgba16(_) => Some(PixelType::U16x4), 20 | _ => None, 21 | } 22 | } 23 | 24 | fn width(&self) -> u32 { 25 | self.width() 26 | } 27 | 28 | fn height(&self) -> u32 { 29 | self.height() 30 | } 31 | 32 | fn image_view(&self) -> Option> { 33 | if let Ok(pixel_type) = try_pixel_type(self) { 34 | if P::pixel_type() == pixel_type { 35 | return TypedImageRef::
<P>
::from_buffer( 36 | self.width(), 37 | self.height(), 38 | self.as_bytes(), 39 | ) 40 | .ok(); 41 | } 42 | } 43 | None 44 | } 45 | } 46 | 47 | impl IntoImageViewMut for DynamicImage { 48 | fn image_view_mut(&mut self) -> Option> { 49 | if let Ok(pixel_type) = try_pixel_type(self) { 50 | if P::pixel_type() == pixel_type { 51 | return TypedImage::
<P>
::from_buffer( 52 | self.width(), 53 | self.height(), 54 | image_as_bytes_mut(self), 55 | ) 56 | .ok(); 57 | } 58 | } 59 | None 60 | } 61 | } 62 | 63 | fn image_as_bytes_mut(image: &mut DynamicImage) -> &mut [u8] { 64 | match image { 65 | DynamicImage::ImageLuma8(img) => (*img).deref_mut(), 66 | DynamicImage::ImageLumaA8(img) => (*img).deref_mut(), 67 | DynamicImage::ImageRgb8(img) => (*img).deref_mut(), 68 | DynamicImage::ImageRgba8(img) => (*img).deref_mut(), 69 | DynamicImage::ImageLuma16(img) => cast_slice_mut((*img).deref_mut()), 70 | DynamicImage::ImageLumaA16(img) => cast_slice_mut((*img).deref_mut()), 71 | DynamicImage::ImageRgb16(img) => cast_slice_mut((*img).deref_mut()), 72 | DynamicImage::ImageRgba16(img) => cast_slice_mut((*img).deref_mut()), 73 | _ => &mut [], 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/images/mod.rs: -------------------------------------------------------------------------------- 1 | //! Contains different types of images and wrappers for them. 2 | use std::fmt::Debug; 3 | 4 | pub use cropped_image::*; 5 | pub use image::*; 6 | pub use typed_cropped_image::*; 7 | pub use typed_image::*; 8 | pub(crate) use unsafe_image::UnsafeImageMut; 9 | 10 | mod cropped_image; 11 | mod image; 12 | mod typed_cropped_image; 13 | mod typed_image; 14 | mod unsafe_image; 15 | 16 | #[cfg(feature = "image")] 17 | mod image_crate; 18 | 19 | #[derive(Debug)] 20 | enum BufferContainer<'a, T: Copy + Debug> { 21 | Borrowed(&'a mut [T]), 22 | Owned(Vec), 23 | } 24 | 25 | impl<'a, T: Copy + Debug> BufferContainer<'a, T> { 26 | fn as_vec(&self) -> Vec { 27 | match self { 28 | Self::Borrowed(slice) => slice.to_vec(), 29 | Self::Owned(vec) => vec.clone(), 30 | } 31 | } 32 | 33 | pub fn borrow(&self) -> &[T] { 34 | match self { 35 | Self::Borrowed(p_ref) => p_ref, 36 | Self::Owned(vec) => vec, 37 | } 38 | } 39 | 40 | pub fn borrow_mut(&mut self) -> &mut [T] { 41 | match self { 42 | Self::Borrowed(p_ref) => p_ref, 43 | Self::Owned(vec) => vec, 44 | } 45 | } 46 | } 47 | 48 | enum View<'a, V: 'a> { 49 | Borrowed(&'a V), 50 | Owned(V), 51 | } 52 | 53 | impl<'a, V> View<'a, V> { 54 | fn get_ref(&self) -> &V { 55 | match self { 56 | Self::Borrowed(v_ref) => v_ref, 57 | Self::Owned(v_own) => v_own, 58 | } 59 | } 60 | } 61 | 62 | enum ViewMut<'a, V: 'a> { 63 | Borrowed(&'a mut V), 64 | Owned(V), 65 | } 66 | 67 | impl<'a, V> ViewMut<'a, V> { 68 | fn get_ref(&self) -> &V { 69 | match self { 70 | Self::Borrowed(v_ref) => v_ref, 71 | Self::Owned(v_own) => v_own, 72 | } 73 | } 74 | 75 | fn get_mut(&mut self) -> &mut V { 76 | match self { 77 | Self::Borrowed(p_ref) => p_ref, 78 | Self::Owned(vec) => vec, 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/images/unsafe_image.rs: -------------------------------------------------------------------------------- 1 | use crate::{ArrayChunks, ImageView, ImageViewMut}; 2 | use std::marker::PhantomData; 3 | use std::num::NonZeroU32; 4 | 5 | #[derive(Copy)] 6 | pub(crate) struct UnsafeImageMut<'a, V> 7 | where 8 | V: ImageViewMut, 9 | { 10 | image: std::ptr::NonNull, 11 | p: PhantomData<&'a V>, 12 | } 13 | 14 | impl<'a, V> Clone for UnsafeImageMut<'a, V> 15 | where 16 | V: ImageViewMut, 17 | { 18 | fn clone(&self) -> Self { 19 | Self { 20 | image: self.image, 21 | p: PhantomData, 22 | } 23 | } 24 | } 25 | 26 | unsafe impl<'a, V: ImageViewMut> Send for UnsafeImageMut<'a, V> {} 27 | unsafe impl<'a, V: ImageViewMut> Sync for UnsafeImageMut<'a, V> 
{} 28 | 29 | impl<'a, V: ImageViewMut> UnsafeImageMut<'a, V> { 30 | pub fn new(image: &'a mut V) -> Self { 31 | let ptr = std::ptr::NonNull::new(image as *mut V).unwrap(); 32 | Self { 33 | image: ptr, 34 | p: PhantomData, 35 | } 36 | } 37 | 38 | fn get(&self) -> &V { 39 | unsafe { self.image.as_ref() } 40 | } 41 | 42 | fn get_mut(&mut self) -> &mut V { 43 | unsafe { self.image.as_mut() } 44 | } 45 | } 46 | 47 | unsafe impl<'a, V: ImageViewMut> ImageView for UnsafeImageMut<'a, V> { 48 | type Pixel = V::Pixel; 49 | 50 | fn width(&self) -> u32 { 51 | self.get().width() 52 | } 53 | 54 | fn height(&self) -> u32 { 55 | self.get().height() 56 | } 57 | 58 | fn iter_rows(&self, start_row: u32) -> impl Iterator { 59 | self.get().iter_rows(start_row) 60 | } 61 | 62 | fn iter_2_rows( 63 | &self, 64 | start_y: u32, 65 | max_rows: u32, 66 | ) -> ArrayChunks, 2> { 67 | self.get().iter_2_rows(start_y, max_rows) 68 | } 69 | 70 | fn iter_4_rows( 71 | &self, 72 | start_y: u32, 73 | max_rows: u32, 74 | ) -> ArrayChunks, 4> { 75 | self.get().iter_4_rows(start_y, max_rows) 76 | } 77 | 78 | fn iter_rows_with_step( 79 | &self, 80 | start_y: f64, 81 | step: f64, 82 | max_rows: u32, 83 | ) -> impl Iterator { 84 | self.get().iter_rows_with_step(start_y, step, max_rows) 85 | } 86 | 87 | fn split_by_height( 88 | &self, 89 | start_row: u32, 90 | height: NonZeroU32, 91 | num_parts: NonZeroU32, 92 | ) -> Option>> { 93 | self.get().split_by_height(start_row, height, num_parts) 94 | } 95 | 96 | fn split_by_width( 97 | &self, 98 | start_col: u32, 99 | width: NonZeroU32, 100 | num_parts: NonZeroU32, 101 | ) -> Option>> { 102 | self.get().split_by_width(start_col, width, num_parts) 103 | } 104 | } 105 | 106 | unsafe impl<'a, V: ImageViewMut> ImageViewMut for UnsafeImageMut<'a, V> { 107 | fn iter_rows_mut(&mut self, start_row: u32) -> impl Iterator { 108 | self.get_mut().iter_rows_mut(start_row) 109 | } 110 | 111 | fn iter_2_rows_mut(&mut self) -> ArrayChunks, 2> { 112 | self.get_mut().iter_2_rows_mut() 113 | } 114 | 115 | fn iter_4_rows_mut(&mut self) -> ArrayChunks, 4> { 116 | self.get_mut().iter_4_rows_mut() 117 | } 118 | 119 | fn split_by_height_mut( 120 | &mut self, 121 | start_row: u32, 122 | height: NonZeroU32, 123 | num_parts: NonZeroU32, 124 | ) -> Option>> { 125 | self.get_mut() 126 | .split_by_height_mut(start_row, height, num_parts) 127 | } 128 | 129 | fn split_by_width_mut( 130 | &mut self, 131 | start_col: u32, 132 | width: NonZeroU32, 133 | num_parts: NonZeroU32, 134 | ) -> Option>> { 135 | self.get_mut() 136 | .split_by_width_mut(start_col, width, num_parts) 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | //! 3 | //! 
## Feature flags 4 | #![doc = document_features::document_features!()] 5 | 6 | pub use alpha::errors::*; 7 | pub use array_chunks::*; 8 | pub use change_components_type::*; 9 | pub use color::mappers::*; 10 | pub use color::PixelComponentMapper; 11 | pub use convolution::*; 12 | pub use cpu_extensions::CpuExtensions; 13 | pub use crop_box::*; 14 | pub use errors::*; 15 | pub use image_view::*; 16 | pub use mul_div::MulDiv; 17 | pub use pixels::PixelType; 18 | pub use resizer::{ResizeAlg, ResizeOptions, Resizer, SrcCropping}; 19 | 20 | use crate::alpha::AlphaMulDiv; 21 | 22 | #[macro_use] 23 | mod utils; 24 | 25 | mod alpha; 26 | mod array_chunks; 27 | mod change_components_type; 28 | mod color; 29 | mod convolution; 30 | mod cpu_extensions; 31 | mod crop_box; 32 | mod errors; 33 | mod image_view; 34 | pub mod images; 35 | mod mul_div; 36 | #[cfg(target_arch = "aarch64")] 37 | mod neon_utils; 38 | pub mod pixels; 39 | mod resizer; 40 | #[cfg(target_arch = "x86_64")] 41 | mod simd_utils; 42 | #[cfg(feature = "for_testing")] 43 | pub mod testing; 44 | #[cfg(feature = "rayon")] 45 | pub(crate) mod threading; 46 | #[cfg(target_arch = "wasm32")] 47 | mod wasm32_utils; 48 | 49 | /// A trait implemented by all pixel types from the crate. 50 | /// 51 | /// This trait must be used in your code instead of [InnerPixel](pixels::InnerPixel). 52 | #[allow(private_bounds)] 53 | pub trait PixelTrait: Convolution + AlphaMulDiv {} 54 | 55 | impl PixelTrait for P {} 56 | -------------------------------------------------------------------------------- /src/simd_utils.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | use std::intrinsics::transmute; 3 | 4 | use crate::pixels::{U8x3, U8x4}; 5 | 6 | #[inline(always)] 7 | pub unsafe fn loadu_si128(buf: &[T], index: usize) -> __m128i { 8 | _mm_loadu_si128(buf.get_unchecked(index..).as_ptr() as *const __m128i) 9 | } 10 | 11 | #[inline(always)] 12 | pub unsafe fn loadu_si256(buf: &[T], index: usize) -> __m256i { 13 | _mm256_loadu_si256(buf.get_unchecked(index..).as_ptr() as *const __m256i) 14 | } 15 | 16 | #[inline(always)] 17 | pub unsafe fn loadl_epi16(buf: &[T], index: usize) -> __m128i { 18 | let mem_addr = buf.get_unchecked(index..).as_ptr() as *const i16; 19 | _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, mem_addr.read_unaligned()) 20 | } 21 | 22 | #[inline(always)] 23 | pub unsafe fn loadl_epi32(buf: &[T], index: usize) -> __m128i { 24 | let mem_addr = buf.get_unchecked(index..).as_ptr() as *const i32; 25 | _mm_set_epi32(0, 0, 0, mem_addr.read_unaligned()) 26 | } 27 | 28 | #[inline(always)] 29 | pub unsafe fn loadl_epi64(buf: &[T], index: usize) -> __m128i { 30 | _mm_loadl_epi64(buf.get_unchecked(index..).as_ptr() as *const __m128i) 31 | } 32 | 33 | #[inline(always)] 34 | pub unsafe fn loadu_ps(buf: &[T], index: usize) -> __m128 { 35 | _mm_loadu_ps(buf.get_unchecked(index..).as_ptr() as *const f32) 36 | } 37 | 38 | #[inline(always)] 39 | pub unsafe fn loadu_ps256(buf: &[T], index: usize) -> __m256 { 40 | _mm256_loadu_ps(buf.get_unchecked(index..).as_ptr() as *const f32) 41 | } 42 | 43 | #[inline(always)] 44 | pub unsafe fn loadu_pd(buf: &[T], index: usize) -> __m128d { 45 | _mm_loadu_pd(buf.get_unchecked(index..).as_ptr() as *const f64) 46 | } 47 | 48 | #[inline(always)] 49 | pub unsafe fn loadu_pd256(buf: &[T], index: usize) -> __m256d { 50 | _mm256_loadu_pd(buf.get_unchecked(index..).as_ptr() as *const f64) 51 | } 52 | 53 | #[inline(always)] 54 | pub unsafe fn mm_cvtepu8_epi32(buf: &[U8x4], index: 
usize) -> __m128i { 55 | let v: i32 = transmute(buf.get_unchecked(index).0); 56 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v)) 57 | } 58 | 59 | #[inline(always)] 60 | pub unsafe fn mm_cvtepu8_epi32_u8x3(buf: &[U8x3], index: usize) -> __m128i { 61 | let pixel = buf.get_unchecked(index).0; 62 | let v: i32 = i32::from_le_bytes([pixel[0], pixel[1], pixel[2], 0]); 63 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v)) 64 | } 65 | 66 | #[inline(always)] 67 | pub unsafe fn mm_cvtepu8_epi32_from_u8(buf: &[u8], index: usize) -> __m128i { 68 | let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; 69 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(ptr.read_unaligned())) 70 | } 71 | 72 | #[inline(always)] 73 | pub unsafe fn mm_cvtsi32_si128_from_u8(buf: &[u8], index: usize) -> __m128i { 74 | let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; 75 | _mm_cvtsi32_si128(ptr.read_unaligned()) 76 | } 77 | 78 | #[inline(always)] 79 | pub unsafe fn mm_load_and_clone_i16x2(buf: &[i16]) -> __m128i { 80 | debug_assert!(buf.len() >= 2); 81 | _mm_set1_epi32((buf.as_ptr() as *const i32).read_unaligned()) 82 | } 83 | 84 | #[inline(always)] 85 | pub unsafe fn mm256_load_and_clone_i16x2(buf: &[i16]) -> __m256i { 86 | debug_assert!(buf.len() >= 2); 87 | _mm256_set1_epi32((buf.as_ptr() as *const i32).read_unaligned()) 88 | } 89 | 90 | #[inline(always)] 91 | pub unsafe fn ptr_i16_to_set1_epi64x(buf: &[i16], index: usize) -> __m128i { 92 | _mm_set1_epi64x((buf.get_unchecked(index..).as_ptr() as *const i64).read_unaligned()) 93 | } 94 | 95 | #[inline(always)] 96 | pub unsafe fn ptr_i16_to_256set1_epi64x(buf: &[i16], index: usize) -> __m256i { 97 | _mm256_set1_epi64x((buf.get_unchecked(index..).as_ptr() as *const i64).read_unaligned()) 98 | } 99 | -------------------------------------------------------------------------------- /src/testing.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | 3 | thread_local!(static TEST_LOGS: RefCell> = const { RefCell::new(Vec::new()) }); 4 | 5 | pub fn log_message(msg: &str) { 6 | TEST_LOGS.with(|f| { 7 | let mut logs = f.borrow_mut(); 8 | logs.push(msg.to_string()); 9 | }); 10 | } 11 | 12 | pub fn logs_contain(msg: &str) -> bool { 13 | TEST_LOGS.with(|f| { 14 | let logs = f.borrow(); 15 | for line in logs.iter() { 16 | if line.contains(msg) { 17 | return true; 18 | } 19 | } 20 | false 21 | }) 22 | } 23 | 24 | pub fn clear_log() { 25 | TEST_LOGS.with(|f| { 26 | let mut logs = f.borrow_mut(); 27 | logs.clear(); 28 | }) 29 | } 30 | -------------------------------------------------------------------------------- /src/threading.rs: -------------------------------------------------------------------------------- 1 | use crate::pixels::InnerPixel; 2 | use crate::{ImageView, ImageViewMut}; 3 | use rayon::current_num_threads; 4 | use rayon::prelude::*; 5 | use std::num::NonZeroU32; 6 | 7 | #[inline] 8 | pub(crate) fn split_h_two_images_for_threading<'a, P: InnerPixel>( 9 | src_view: &'a impl ImageView, 10 | dst_view: &'a mut impl ImageViewMut, 11 | src_offset: u32, 12 | ) -> Option< 13 | impl ParallelIterator< 14 | Item = ( 15 | impl ImageView + 'a, 16 | impl ImageViewMut + 'a, 17 | ), 18 | >, 19 | > { 20 | debug_assert!(src_view.height() - src_offset >= dst_view.height()); 21 | 22 | let dst_width = dst_view.width(); 23 | let dst_height = dst_view.height(); 24 | let max_num_parts = calculate_max_h_parts_number(dst_width, dst_height); 25 | 26 | let num_threads = current_num_threads() as u32; 27 | if num_threads > 1 && max_num_parts > 1 { 28 | 
let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 29 | let dst_height = NonZeroU32::new(dst_height).unwrap(); 30 | if let Some(src_parts) = src_view.split_by_height(src_offset, dst_height, num_parts) { 31 | if let Some(dst_parts) = dst_view.split_by_height_mut(0, dst_height, num_parts) { 32 | let src_iter = src_parts.into_par_iter(); 33 | let dst_iter = dst_parts.into_par_iter(); 34 | return Some(src_iter.zip(dst_iter)); 35 | } 36 | } 37 | } 38 | None 39 | } 40 | 41 | #[inline] 42 | pub(crate) fn split_h_one_image_for_threading( 43 | image_view: &mut impl ImageViewMut, 44 | ) -> Option + '_>> { 45 | let width = image_view.width(); 46 | let height = image_view.height(); 47 | let max_num_parts = calculate_max_h_parts_number(width, height); 48 | 49 | let num_threads = current_num_threads() as u32; 50 | if num_threads > 1 && max_num_parts > 1 { 51 | let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 52 | let height = NonZeroU32::new(height).unwrap(); 53 | let img_parts = image_view.split_by_height_mut(0, height, num_parts); 54 | return img_parts.map(|parts| parts.into_par_iter()); 55 | } 56 | None 57 | } 58 | 59 | /// It is not optimal to split images on too small parts. 60 | /// We have to calculate minimal height of one part. 61 | /// For small images, it is equal to `constant / area`. 62 | /// For tall images, it is equal to `height / 256`. 63 | fn calculate_max_h_parts_number(width: u32, height: u32) -> u32 { 64 | if width == 0 || height == 0 { 65 | return 1; 66 | } 67 | let area = height * height.max(width); 68 | let min_height = ((1 << 14) / area).max(height / 256); 69 | height / min_height.max(1) 70 | } 71 | 72 | #[inline] 73 | pub(crate) fn split_v_two_images_for_threading<'a, P: InnerPixel>( 74 | src_view: &'a impl ImageView, 75 | dst_view: &'a mut impl ImageViewMut, 76 | src_offset: u32, 77 | ) -> Option< 78 | impl ParallelIterator< 79 | Item = ( 80 | impl ImageView + 'a, 81 | impl ImageViewMut + 'a, 82 | ), 83 | >, 84 | > { 85 | debug_assert!(src_view.width() - src_offset >= dst_view.width()); 86 | 87 | let dst_width = dst_view.width(); 88 | let dst_height = dst_view.height(); 89 | let max_num_parts = calculate_max_v_parts_number(dst_width, dst_height); 90 | 91 | let num_threads = current_num_threads() as u32; 92 | if num_threads > 1 && max_num_parts > 1 { 93 | let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 94 | let dst_width = NonZeroU32::new(dst_width).unwrap(); 95 | if let Some(src_parts) = src_view.split_by_width(src_offset, dst_width, num_parts) { 96 | if let Some(dst_parts) = dst_view.split_by_width_mut(0, dst_width, num_parts) { 97 | let src_iter = src_parts.into_par_iter(); 98 | let dst_iter = dst_parts.into_par_iter(); 99 | return Some(src_iter.zip(dst_iter)); 100 | } 101 | } 102 | } 103 | None 104 | } 105 | 106 | /// It is not optimal to split images on too small parts. 107 | /// We have to calculate minimal width of one part. 108 | /// For small images, it is equal to `constant / area`. 109 | /// For wide images, it is equal to `width / 256`. 
110 | fn calculate_max_v_parts_number(width: u32, height: u32) -> u32 { 111 | if width == 0 || height == 0 { 112 | return 1; 113 | } 114 | let area = width * height.max(width); 115 | let min_width = ((1 << 14) / area).max(width / 256); 116 | width / min_width.max(1) 117 | } 118 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | /// Pre-reading data from memory increases speed slightly for some operations 2 | #[inline(always)] 3 | pub(crate) fn foreach_with_pre_reading( 4 | mut iter: impl Iterator, 5 | mut read_data: impl FnMut(I) -> D, 6 | mut process_data: impl FnMut(D), 7 | ) { 8 | let mut next_data: D; 9 | if let Some(src) = iter.next() { 10 | next_data = read_data(src); 11 | for src in iter { 12 | let data = next_data; 13 | next_data = read_data(src); 14 | process_data(data); 15 | } 16 | process_data(next_data); 17 | } 18 | } 19 | 20 | macro_rules! test_log { 21 | ($s:expr) => { 22 | #[cfg(feature = "for_testing")] 23 | { 24 | use crate::testing::log_message; 25 | log_message($s); 26 | } 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /src/wasm32_utils.rs: -------------------------------------------------------------------------------- 1 | use std::arch::wasm32::*; 2 | 3 | use crate::pixels::{U8x3, U8x4}; 4 | 5 | #[inline] 6 | #[target_feature(enable = "simd128")] 7 | pub(crate) unsafe fn load_v128(buf: &[T], index: usize) -> v128 { 8 | v128_load(buf.get_unchecked(index..).as_ptr() as *const v128) 9 | } 10 | 11 | #[inline] 12 | #[target_feature(enable = "simd128")] 13 | pub(crate) unsafe fn loadl_i64(buf: &[T], index: usize) -> v128 { 14 | let p = buf.get_unchecked(index..).as_ptr() as *const i64; 15 | i64x2(p.read_unaligned(), 0) 16 | } 17 | 18 | #[inline] 19 | #[target_feature(enable = "simd128")] 20 | pub(crate) unsafe fn loadl_i32(buf: &[T], index: usize) -> v128 { 21 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 22 | i32x4(p.read_unaligned(), 0, 0, 0) 23 | } 24 | 25 | #[inline] 26 | #[target_feature(enable = "simd128")] 27 | pub(crate) unsafe fn loadl_i16(buf: &[T], index: usize) -> v128 { 28 | let p = buf.get_unchecked(index..).as_ptr() as *const i16; 29 | i16x8(p.read_unaligned(), 0, 0, 0, 0, 0, 0, 0) 30 | } 31 | 32 | #[inline] 33 | #[target_feature(enable = "simd128")] 34 | pub(crate) unsafe fn ptr_i16_to_set1_i64(buf: &[i16], index: usize) -> v128 { 35 | let p = buf.get_unchecked(index..).as_ptr() as *const i64; 36 | i64x2_splat(p.read_unaligned()) 37 | } 38 | 39 | #[inline] 40 | #[target_feature(enable = "simd128")] 41 | pub(crate) unsafe fn ptr_i16_to_set1_i32(buf: &[i16], index: usize) -> v128 { 42 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 43 | i32x4_splat(p.read_unaligned()) 44 | } 45 | 46 | #[inline] 47 | #[target_feature(enable = "simd128")] 48 | pub(crate) unsafe fn i32x4_extend_low_ptr_u8(buf: &[u8], index: usize) -> v128 { 49 | let p = buf.get_unchecked(index..).as_ptr() as *const v128; 50 | u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(v128_load(p))) 51 | } 52 | 53 | #[inline] 54 | #[target_feature(enable = "simd128")] 55 | pub(crate) unsafe fn i32x4_extend_low_ptr_u8x4(buf: &[U8x4], index: usize) -> v128 { 56 | let v: u32 = u32::from_le_bytes(buf.get_unchecked(index).0); 57 | u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(u32x4(v, 0, 0, 0))) 58 | } 59 | 60 | #[inline] 61 | #[target_feature(enable = "simd128")] 62 | pub(crate) unsafe fn 
i32x4_extend_low_ptr_u8x3(buf: &[U8x3], index: usize) -> v128 { 63 | let pixel = buf.get_unchecked(index).0; 64 | i32x4(pixel[0] as i32, pixel[1] as i32, pixel[2] as i32, 0) 65 | } 66 | 67 | #[inline] 68 | #[target_feature(enable = "simd128")] 69 | pub(crate) unsafe fn i32x4_v128_from_u8(buf: &[u8], index: usize) -> v128 { 70 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 71 | i32x4(p.read_unaligned(), 0, 0, 0) 72 | } 73 | 74 | // #[inline] 75 | // #[target_feature(enable = "simd128")] 76 | // pub(crate) unsafe fn u16x8_mul_shr16(a_u16x8: v128, b_u16x8: v128) -> v128 { 77 | // let lo_u32x4 = u32x4_extmul_low_u16x8(a_u16x8, b_u16x8); 78 | // let hi_u32x4 = u32x4_extmul_high_u16x8(a_u16x8, b_u16x8); 79 | // i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo_u32x4, hi_u32x4) 80 | // } 81 | 82 | pub(crate) unsafe fn u16x8_mul_add_shr16(a_u16x8: v128, b_u16x8: v128, c: v128) -> v128 { 83 | let lo_u32x4 = u32x4_extmul_low_u16x8(a_u16x8, b_u16x8); 84 | let hi_u32x4 = u32x4_extmul_high_u16x8(a_u16x8, b_u16x8); 85 | let lo_u32x4 = u32x4_add(lo_u32x4, c); 86 | let hi_u32x4 = u32x4_add(hi_u32x4, c); 87 | i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo_u32x4, hi_u32x4) 88 | } 89 | 90 | #[inline] 91 | #[target_feature(enable = "simd128")] 92 | pub(crate) unsafe fn i64x2_mul_lo(a: v128, b: v128) -> v128 { 93 | const SHUFFLE: v128 = i8x16(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1); 94 | i64x2_extmul_low_i32x4(i8x16_swizzle(a, SHUFFLE), i8x16_swizzle(b, SHUFFLE)) 95 | } 96 | -------------------------------------------------------------------------------- /tests/image_view.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::images::{TypedCroppedImageMut, TypedImage}; 2 | use fast_image_resize::pixels::U8; 3 | use fast_image_resize::{ImageView, ImageViewMut}; 4 | use testing::non_zero_u32; 5 | 6 | mod testing; 7 | 8 | mod split_by_width { 9 | use super::*; 10 | use fast_image_resize::images::{TypedCroppedImage, TypedImageRef}; 11 | 12 | fn split(img: &T) { 13 | for num_parts in 1..16 { 14 | let res = img 15 | .split_by_width(0, non_zero_u32(512), non_zero_u32(num_parts)) 16 | .unwrap(); 17 | assert_eq!(res.len() as u32, num_parts); 18 | let sum_width = res.iter().map(|v| v.width()).sum::(); 19 | assert_eq!(sum_width, 512); 20 | } 21 | } 22 | 23 | fn split_mut(img: &mut T) { 24 | for num_parts in 1..16 { 25 | let res = img 26 | .split_by_width_mut(0, non_zero_u32(512), non_zero_u32(num_parts)) 27 | .unwrap(); 28 | assert_eq!(res.len() as u32, num_parts); 29 | let sum_width = res.iter().map(|v| v.width()).sum::(); 30 | assert_eq!(sum_width, 512); 31 | } 32 | } 33 | 34 | #[test] 35 | fn typed_image_ref() { 36 | let width = 512; 37 | let height = 384; 38 | let buffer = vec![U8::new(0); (width * height) as usize]; 39 | let img = TypedImageRef::::new(width, height, &buffer).unwrap(); 40 | split(&img); 41 | } 42 | 43 | #[test] 44 | fn typed_image() { 45 | let mut img = TypedImage::::new(512, 384); 46 | split(&img); 47 | split_mut(&mut img); 48 | } 49 | 50 | #[test] 51 | fn typed_cropped_image() { 52 | let img = TypedImage::::new(512 + 20, 384 + 20); 53 | let cropped_img = TypedCroppedImage::from_ref(&img, 10, 10, 512, 384).unwrap(); 54 | split(&cropped_img); 55 | } 56 | 57 | #[test] 58 | fn typed_cropped_image_mut() { 59 | let mut img = TypedImage::::new(512 + 20, 384 + 20); 60 | let mut cropped_img = TypedCroppedImageMut::from_ref(&mut img, 10, 10, 512, 384).unwrap(); 61 | split(&cropped_img); 62 | split_mut(&mut cropped_img); 63 | 
} 64 | } 65 | 66 | mod split_by_height { 67 | use super::*; 68 | use fast_image_resize::images::{TypedCroppedImage, TypedImageRef}; 69 | 70 | fn split(img: &T) { 71 | for num_parts in 1..16 { 72 | let res = img 73 | .split_by_height(0, non_zero_u32(512), non_zero_u32(num_parts)) 74 | .unwrap(); 75 | assert_eq!(res.len() as u32, num_parts); 76 | let sum_height = res.iter().map(|v| v.height()).sum::(); 77 | assert_eq!(sum_height, 512); 78 | } 79 | } 80 | 81 | fn split_mut(img: &mut T) { 82 | for num_parts in 1..16 { 83 | let res = img 84 | .split_by_height_mut(0, non_zero_u32(512), non_zero_u32(num_parts)) 85 | .unwrap(); 86 | assert_eq!(res.len() as u32, num_parts); 87 | let sum_height = res.iter().map(|v| v.height()).sum::(); 88 | assert_eq!(sum_height, 512); 89 | } 90 | } 91 | 92 | #[test] 93 | fn typed_image_ref() { 94 | let width = 384; 95 | let height = 512; 96 | let buffer = vec![U8::new(0); (width * height) as usize]; 97 | let img = TypedImageRef::::new(width, height, &buffer).unwrap(); 98 | split(&img); 99 | } 100 | 101 | #[test] 102 | fn typed_image() { 103 | let mut img: TypedImage = TypedImage::new(384, 512); 104 | split(&img); 105 | split_mut(&mut img); 106 | } 107 | 108 | #[test] 109 | fn typed_cropped_image() { 110 | let img = TypedImage::::new(384 + 20, 512 + 20); 111 | let cropped_img = TypedCroppedImage::from_ref(&img, 10, 10, 384, 512).unwrap(); 112 | split(&cropped_img); 113 | } 114 | 115 | #[test] 116 | fn typed_cropped_image_mut() { 117 | let mut img: TypedImage = TypedImage::new(384 + 20, 512 + 20); 118 | let mut cropped_img = TypedCroppedImageMut::from_ref(&mut img, 10, 10, 384, 512).unwrap(); 119 | split(&cropped_img); 120 | split_mut(&mut cropped_img); 121 | } 122 | } 123 | --------------------------------------------------------------------------------
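Note (not part of the repository): a minimal usage sketch assembled from the public items re-exported in src/lib.rs above (Resizer, CpuExtensions, PixelType and the images module). The exact signatures of Image::new, Resizer::new, Resizer::set_cpu_extensions and Resizer::resize are assumptions based on the crate's documented API and are not shown in this excerpt.

use fast_image_resize::images::Image;
use fast_image_resize::{CpuExtensions, PixelType, Resizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Source image: 64x64 RGBA8 pixels, zero-filled for the sake of the example.
    let src = Image::new(64, 64, PixelType::U8x4);
    // Destination image defines the target size of the resize operation.
    let mut dst = Image::new(32, 32, PixelType::U8x4);

    let mut resizer = Resizer::new();
    // The default CpuExtensions value already selects the best supported SIMD
    // extension (see src/cpu_extensions.rs above); forcing it explicitly is
    // unsafe because the caller must guarantee the CPU really supports it.
    let ext = CpuExtensions::default();
    if ext.is_supported() {
        unsafe { resizer.set_cpu_extensions(ext) };
    }

    // `None` means "use the default ResizeOptions".
    resizer.resize(&src, &mut dst, None)?;
    Ok(())
}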