├── .github └── workflows │ └── check_and_test.yaml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches ├── bench_alpha.rs ├── bench_color_mapper.rs ├── bench_compare_l.rs ├── bench_compare_l16.rs ├── bench_compare_l32f.rs ├── bench_compare_la.rs ├── bench_compare_la16.rs ├── bench_compare_la32f.rs ├── bench_compare_rgb.rs ├── bench_compare_rgb16.rs ├── bench_compare_rgb32f.rs ├── bench_compare_rgba.rs ├── bench_compare_rgba16.rs ├── bench_compare_rgba32f.rs ├── bench_resize.rs ├── templates │ ├── bench_compare_l.md.tera │ ├── bench_compare_l16.md.tera │ ├── bench_compare_l32f.md.tera │ ├── bench_compare_la.md.tera │ ├── bench_compare_la16.md.tera │ ├── bench_compare_la32f.md.tera │ ├── bench_compare_rgb.md.tera │ ├── bench_compare_rgb16.md.tera │ ├── bench_compare_rgb32f.md.tera │ ├── bench_compare_rgba.md.tera │ ├── bench_compare_rgba16.md.tera │ ├── bench_compare_rgba32f.md.tera │ └── introduction.md.tera └── utils │ ├── bencher.rs │ ├── mod.rs │ ├── resize_functions.rs │ ├── results.rs │ └── testing.rs ├── benchmarks-arm64.md ├── benchmarks-wasm32.md ├── benchmarks-x86_64.md ├── data ├── crop_test.png ├── nasa-4019x4019-rgba.png ├── nasa-4019x4019.png ├── nasa-4928x3279-rgba.png ├── nasa-4928x3279.png ├── nasa-852x567-rgba.png └── nasa-852x567.png ├── dev.md ├── resizer ├── Cargo.toml └── src │ ├── main.rs │ └── structs.rs ├── rustfmt.toml ├── src ├── alpha │ ├── common.rs │ ├── errors.rs │ ├── f32x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── mod.rs │ ├── u16x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ └── u8x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs ├── array_chunks.rs ├── change_components_type.rs ├── color │ ├── mappers.rs │ └── mod.rs ├── convolution │ ├── f32x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── f32x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── filters.rs │ ├── i32x1 │ │ ├── mod.rs │ │ └── native.rs │ ├── macros.rs │ ├── mod.rs │ ├── optimisations.rs │ ├── u16x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u16x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x1 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x2 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x3 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── u8x4 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ ├── vertical_f32 │ │ ├── avx2.rs │ │ ├── 
mod.rs │ │ ├── native.rs │ │ └── sse4.rs │ ├── vertical_u16 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs │ └── vertical_u8 │ │ ├── avx2.rs │ │ ├── mod.rs │ │ ├── native.rs │ │ ├── neon.rs │ │ ├── sse4.rs │ │ └── wasm32.rs ├── cpu_extensions.rs ├── crop_box.rs ├── errors.rs ├── image_view.rs ├── images │ ├── cropped_image.rs │ ├── image.rs │ ├── image_crate.rs │ ├── mod.rs │ ├── typed_cropped_image.rs │ ├── typed_image.rs │ └── unsafe_image.rs ├── lib.rs ├── mul_div.rs ├── neon_utils.rs ├── pixels.rs ├── resizer.rs ├── simd_utils.rs ├── testing.rs ├── threading.rs ├── utils.rs └── wasm32_utils.rs └── tests ├── alpha_tests.rs ├── color_tests.rs ├── image_view.rs ├── images_tests.rs ├── resize_tests.rs └── testing.rs /.github/workflows/check_and_test.yaml: -------------------------------------------------------------------------------- 1 | name: Check and Test 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | workflow_dispatch: { } 9 | 10 | env: 11 | CARGO_TERM_COLOR: always 12 | DONT_SAVE_RESULT: 1 13 | RAYON_NUM_THREADS: 4 14 | 15 | jobs: 16 | run_tests: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ ubuntu-latest, macos-latest, windows-latest ] 21 | 22 | name: Test `cargo check/test` on ${{ matrix.os }} 23 | runs-on: ${{ matrix.os }} 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: Swatinem/rust-cache@v2 29 | with: 30 | cache-on-failure: "true" 31 | 32 | - name: Run single-thread tests 33 | run: | 34 | cargo check 35 | cargo test 36 | 37 | - name: Run multi-thread tests 38 | run: | 39 | cargo check --features rayon 40 | cargo test --features rayon 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .* 3 | !/.gitignore 4 | data/result 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "resizer", 4 | ] 5 | 6 | 7 | [package] 8 | name = "fast_image_resize" 9 | version = "5.1.4" 10 | authors = ["Kirill Kuzminykh "] 11 | edition = "2021" 12 | license = "MIT OR Apache-2.0" 13 | description = "Library for fast image resizing with using of SIMD instructions" 14 | readme = "README.md" 15 | keywords = ["image", "resize"] 16 | repository = "https://github.com/cykooz/fast_image_resize" 17 | documentation = "https://docs.rs/crate/fast_image_resize" 18 | exclude = ["/data", "/.github"] 19 | 20 | 21 | [dependencies] 22 | cfg-if = "1.0" 23 | num-traits = "0.2.19" 24 | thiserror = "1.0" 25 | document-features = "0.2.11" 26 | # Optional dependencies 27 | image = { version = "0.25.6", optional = true, default-features = false } 28 | bytemuck = { version = "1.23", optional = true } 29 | rayon = { version = "1.10", optional = true } 30 | 31 | 32 | [features] 33 | ## Enable this feature to implement traits [IntoImageView](crate::IntoImageView) and 34 | ## [IntoImageViewMut](crate::IntoImageViewMut) for the 35 | ## [DynamicImage](https://docs.rs/image/latest/image/enum.DynamicImage.html) 36 | ## type from the `image` crate. 37 | image = ["dep:image", "dep:bytemuck"] 38 | ## This feature enables image processing in `rayon` thread pool. 
39 | rayon = ["dep:rayon", "resize/rayon", "image/rayon"] 40 | for_testing = ["image", "image/png"] 41 | only_u8x4 = [] # This can be used to experiment with the crate's code. 42 | 43 | 44 | [dev-dependencies] 45 | fast_image_resize = { path = ".", features = ["for_testing"] } 46 | resize = { version = "0.8.8", default-features = false, features = ["std"] } 47 | rgb = "0.8.50" 48 | png = "0.17.16" 49 | serde = { version = "1.0", features = ["serde_derive"] } 50 | serde_json = "1.0" 51 | walkdir = "2.5" 52 | itertools = "0.14.0" 53 | criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] } 54 | tera = "1.20" 55 | 56 | 57 | [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] 58 | nix = { version = "0.30.1", default-features = false, features = ["sched"] } 59 | 60 | 61 | [target.'cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))'.dev-dependencies] 62 | libvips = "1.7" 63 | 64 | 65 | [profile.test] 66 | opt-level = 1 67 | incremental = true 68 | 69 | 70 | # debug builds for deps 71 | [profile.dev.package.'*'] 72 | opt-level = 3 73 | 74 | 75 | # release build for procmacros - same config as debug build for procmacros 76 | [profile.release.build-override] 77 | opt-level = 2 78 | debug = false # when possible 79 | 80 | 81 | [profile.release] 82 | opt-level = 3 83 | incremental = true 84 | #lto = true 85 | #codegen-units = 1 86 | strip = true 87 | 88 | 89 | #[profile.release.package.fast_image_resize] 90 | #codegen-units = 1 91 | 92 | 93 | [profile.release.package.image] 94 | codegen-units = 1 95 | 96 | 97 | [profile.release.package.resize] 98 | codegen-units = 1 99 | 100 | 101 | [package.metadata.release] 102 | pre-release-replacements = [ 103 | { file = "CHANGELOG.md", search = "Unreleased", replace = "{{version}}" }, 104 | { file = "CHANGELOG.md", search = "ReleaseDate", replace = "{{date}}" } 105 | ] 106 | 107 | 108 | [[bench]] 109 | name = "bench_resize" 110 | harness = false 111 | 112 | 113 | [[bench]] 114 | name = "bench_alpha" 115 | harness = false 116 | 117 | 118 | [[bench]] 119 | name = "bench_compare_rgb" 120 | harness = false 121 | 122 | 123 | [[bench]] 124 | name = "bench_compare_rgb16" 125 | harness = false 126 | 127 | 128 | [[bench]] 129 | name = "bench_compare_rgb32f" 130 | harness = false 131 | 132 | 133 | [[bench]] 134 | name = "bench_compare_rgba" 135 | harness = false 136 | 137 | 138 | [[bench]] 139 | name = "bench_compare_rgba16" 140 | harness = false 141 | 142 | 143 | [[bench]] 144 | name = "bench_compare_rgba32f" 145 | harness = false 146 | 147 | 148 | [[bench]] 149 | name = "bench_compare_l" 150 | harness = false 151 | 152 | 153 | [[bench]] 154 | name = "bench_compare_la" 155 | harness = false 156 | 157 | 158 | [[bench]] 159 | name = "bench_compare_l16" 160 | harness = false 161 | 162 | 163 | [[bench]] 164 | name = "bench_compare_la16" 165 | harness = false 166 | 167 | 168 | [[bench]] 169 | name = "bench_compare_l32f" 170 | harness = false 171 | 172 | 173 | [[bench]] 174 | name = "bench_compare_la32f" 175 | harness = false 176 | 177 | 178 | [[bench]] 179 | name = "bench_color_mapper" 180 | harness = false 181 | 182 | 183 | # Header of next release in CHANGELOG.md: 184 | # ## [Unreleased] - ReleaseDate 185 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kirill Kuzminykh 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benches/bench_color_mapper.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::create_srgb_mapper; 2 | use fast_image_resize::images::Image; 3 | use fast_image_resize::pixels::U8x3; 4 | use utils::pin_process_to_cpu0; 5 | use utils::testing::PixelTestingExt; 6 | 7 | mod utils; 8 | 9 | pub fn bench_color_mapper(bench_group: &mut utils::BenchGroup) { 10 | let src_image = U8x3::load_big_src_image(); 11 | let mut dst_image = Image::new( 12 | src_image.width(), 13 | src_image.height(), 14 | src_image.pixel_type(), 15 | ); 16 | let mapper = create_srgb_mapper(); 17 | bench_group 18 | .criterion_group 19 | .bench_function("SRGB U8x3 => RGB U8x3", |bencher| { 20 | bencher.iter(|| { 21 | mapper.forward_map(&src_image, &mut dst_image).unwrap(); 22 | }) 23 | }); 24 | } 25 | 26 | fn main() { 27 | pin_process_to_cpu0(); 28 | utils::run_bench(bench_color_mapper, "Color mapper"); 29 | } 30 | -------------------------------------------------------------------------------- /benches/bench_compare_l.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8; 2 | use resize::Pixel::Gray8; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_compare_l(bench_group: &mut utils::BenchGroup) { 9 | type P = U8; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | Gray8, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_compare_l, "Compare resize of U8 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_l16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16; 2 | use resize::Pixel::Gray16; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_l16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | Gray16, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_l16, "Compare resize of U16 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_l32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32; 2 | use resize::Pixel::GrayF32; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_l32f(bench_group: &mut utils::BenchGroup) { 9 | type P = F32; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | GrayF32, 15 | src_image.as_raw().as_gray(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_l32f, "Compare resize of L32F image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_la.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la(bench_group: &mut utils::BenchGroup) { 6 | type P = U8x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la, "Compare resize of LA image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_la16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la16(bench_group: &mut utils::BenchGroup) { 6 | type P = U16x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la16, "Compare resize of LA16 image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_la32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x2; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_la32f(bench_group: &mut utils::BenchGroup) { 6 | type P = F32x2; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_la32f, "Compare resize of LA32F image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x3; 2 | use resize::Pixel::RGB8; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb(bench_group: &mut utils::BenchGroup) { 9 | type P = U8x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGB8, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb, "Compare resize of RGB image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x3; 2 | use resize::Pixel::RGB16; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGB16, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb16, "Compare resize of RGB16 image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgb32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x3; 2 | use resize::Pixel::RGBF32; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgb32f(bench_group: &mut utils::BenchGroup) { 9 | type P = F32x3; 10 | let src_image = P::load_big_image(); 11 | utils::image_resize(bench_group, &src_image); 12 | utils::resize_resize( 13 | bench_group, 14 | RGBF32, 15 | src_image.as_raw().as_rgb(), 16 | src_image.width(), 17 | src_image.height(), 18 | ); 19 | utils::libvips_resize::
<P>(bench_group, false); 20 | utils::fir_resize::<P>
(bench_group, false); 21 | } 22 | 23 | fn main() { 24 | let res = utils::run_bench(bench_downscale_rgb32f, "Compare resize of RGB32F image"); 25 | utils::print_and_write_compare_result(&res); 26 | } 27 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U8x4; 2 | use resize::Pixel::RGBA8P; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgba(bench_group: &mut utils::BenchGroup) { 9 | type P = U8x4; 10 | let src_image = P::load_big_image(); 11 | utils::resize_resize( 12 | bench_group, 13 | RGBA8P, 14 | src_image.as_raw().as_rgba(), 15 | src_image.width(), 16 | src_image.height(), 17 | ); 18 | utils::libvips_resize::
<P>(bench_group, true); 19 | utils::fir_resize::<P>
(bench_group, true); 20 | } 21 | 22 | fn main() { 23 | let res = utils::run_bench(bench_downscale_rgba, "Compare resize of RGBA image"); 24 | utils::print_and_write_compare_result(&res); 25 | } 26 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba16.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::U16x4; 2 | use resize::Pixel::RGBA16P; 3 | use rgb::FromSlice; 4 | use utils::testing::PixelTestingExt; 5 | 6 | mod utils; 7 | 8 | pub fn bench_downscale_rgba16(bench_group: &mut utils::BenchGroup) { 9 | type P = U16x4; 10 | let src_image = P::load_big_image(); 11 | utils::resize_resize( 12 | bench_group, 13 | RGBA16P, 14 | src_image.as_raw().as_rgba(), 15 | src_image.width(), 16 | src_image.height(), 17 | ); 18 | utils::libvips_resize::
<P>(bench_group, true); 19 | utils::fir_resize::<P>
(bench_group, true); 20 | } 21 | 22 | fn main() { 23 | let res = utils::run_bench(bench_downscale_rgba16, "Compare resize of RGBA16 image"); 24 | utils::print_and_write_compare_result(&res); 25 | } 26 | -------------------------------------------------------------------------------- /benches/bench_compare_rgba32f.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::pixels::F32x4; 2 | 3 | mod utils; 4 | 5 | pub fn bench_downscale_rgba32f(bench_group: &mut utils::BenchGroup) { 6 | type P = F32x4; 7 | utils::libvips_resize::
<P>(bench_group, true); 8 | utils::fir_resize::<P>
(bench_group, true); 9 | } 10 | 11 | fn main() { 12 | let res = utils::run_bench(bench_downscale_rgba32f, "Compare resize of RGBA32F image"); 13 | utils::print_and_write_compare_result(&res); 14 | } 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L8 image (U8) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with one byte per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L16 image (U16) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with two bytes per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_l32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize L32F image (F32) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into grayscale image with two bytes per pixel. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA8 image (U8x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (two bytes per pixel). 10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA16 (luma with alpha channel) image (U16x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (four bytes per pixel). 
10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_la32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize LA32F (luma with alpha channel) image (F32x2) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | has converted into grayscale image with an alpha channel (two `f32` values per pixel). 10 | - Numbers in the table mean a duration of image resizing in milliseconds. 11 | - The `image` crate does not support multiplying and dividing by alpha channel. 12 | - The `resize` crate does not support this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB8 image (U8x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | - Numbers in the table mean a duration of image resizing in milliseconds. 9 | 10 | {{ compare_results -}} 11 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB16 image (U16x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into RGB16 image. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgb32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGB32F image (F32x3) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => resize => dst_image` 6 | 7 | - Source image [nasa-4928x3279.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png) 8 | has converted into RGB32F image. 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | 11 | {{ compare_results -}} 12 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA8 image (U8x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 
10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | 12 | {{ compare_results -}} 13 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba16.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA16 image (U16x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | 12 | {{ compare_results -}} 13 | -------------------------------------------------------------------------------- /benches/templates/bench_compare_rgba32f.md.tera: -------------------------------------------------------------------------------- 1 | ### Resize RGBA32F image (F32x4) 4928x3279 => 852x567 2 | 3 | Pipeline: 4 | 5 | `src_image => multiply by alpha => resize => divide by alpha => dst_image` 6 | 7 | - Source image 8 | [nasa-4928x3279-rgba.png](https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279-rgba.png) 9 | - Numbers in the table mean a duration of image resizing in milliseconds. 10 | - The `image` crate does not support multiplying and dividing by alpha channel. 11 | - The `resize` crate does not support multiplying and dividing by alpha channel 12 | for this pixel format. 13 | 14 | {{ compare_results -}} 15 | -------------------------------------------------------------------------------- /benches/templates/introduction.md.tera: -------------------------------------------------------------------------------- 1 | ## Benchmarks of fast_image_resize crate for {{ arch_name }} architecture 2 | 3 | Environment: 4 | 5 | {% if arch_id == "arm64" -%} 6 | - CPU: Neoverse-N1 2GHz (Oracle Cloud Compute, VM.Standard.A1.Flex) 7 | {% else -%} 8 | - CPU: AMD Ryzen 9 5950X 9 | - RAM: DDR4 4000 MHz 10 | {% endif -%} 11 | - Ubuntu 24.04 (linux 6.11.0) 12 | - Rust 1.87.0 13 | - criterion = "0.5.1" 14 | - fast_image_resize = "5.1.4" 15 | {% if arch_id == "wasm32" -%} 16 | - wasmtime = "32.0.0" 17 | {% endif %} 18 | 19 | Other libraries used to compare of resizing speed: 20 | 21 | - image = "0.25.6" () 22 | - resize = "0.8.8" (, single-threaded mode) 23 | {% if arch_id != "wasm32" -%} 24 | - libvips = "8.15.1" (single-threaded mode) 25 | {% endif %} 26 | 27 | Resize algorithms: 28 | 29 | - Nearest 30 | - Box - convolution with minimal kernel size 1x1 px 31 | - Bilinear - convolution with minimal kernel size 2x2 px 32 | - Bicubic (CatmullRom) - convolution with minimal kernel size 4x4 px 33 | - Lanczos3 - convolution with minimal kernel size 6x6 px 34 | -------------------------------------------------------------------------------- /benches/utils/bencher.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | use std::time::{Duration, SystemTime}; 4 | 5 | use criterion::measurement::WallTime; 6 | use criterion::{Bencher, BenchmarkGroup, BenchmarkId, Criterion}; 7 | 8 | use super::{cargo_target_directory, get_arch_id_and_name, get_results, BenchResult}; 9 | 10 | pub struct BenchGroup<'a> { 11 | pub criterion_group: BenchmarkGroup<'a, WallTime>, 12 | old_results: Vec, 13 | results: Vec, 14 | } 15 | 16 | impl<'a> 
BenchGroup<'a> { 17 | fn finish(self) -> Vec { 18 | self.criterion_group.finish(); 19 | self.results 20 | } 21 | } 22 | 23 | pub fn run_bench(bench_fn: F, name: &str) -> Vec 24 | where 25 | F: FnOnce(&mut BenchGroup), 26 | { 27 | if env::var("PIN_TO_CPU0").is_ok() { 28 | pin_process_to_cpu0(); 29 | } 30 | 31 | let arch_id = get_arch_id_and_name().0; 32 | let output_dir = criterion_output_directory().join(arch_id); 33 | let mut criterion = Criterion::default() 34 | .output_directory(&output_dir) 35 | .configure_from_args(); 36 | 37 | let now = SystemTime::now(); 38 | let results_dir = output_dir.join(name); 39 | 40 | let results_lifetime: u32 = env::var("RESULTS_LIFETIME") 41 | .unwrap_or_else(|_| "0".to_owned()) 42 | .parse() 43 | .unwrap_or_default(); 44 | let old_results = if results_lifetime > 0 && name.starts_with("Compare ") { 45 | let old_now = now - Duration::from_secs(results_lifetime as u64 * 24 * 3600); 46 | get_results(&results_dir, &old_now) 47 | } else { 48 | vec![] 49 | }; 50 | 51 | let mut group = BenchGroup { 52 | criterion_group: criterion.benchmark_group(name), 53 | old_results, 54 | results: vec![], 55 | }; 56 | bench_fn(&mut group); 57 | let mut results = group.finish(); 58 | criterion.final_summary(); 59 | 60 | let new_results = get_results(&results_dir, &now); 61 | if new_results.is_empty() { 62 | new_results 63 | } else { 64 | for res in results.iter_mut().filter(|r| r.estimate < 0.) { 65 | res.estimate = new_results 66 | .iter() 67 | .find(|new_res| { 68 | new_res.function_name == res.function_name && new_res.parameter == res.parameter 69 | }) 70 | .map(|r| r.estimate) 71 | .unwrap_or(0.) 72 | } 73 | results 74 | } 75 | } 76 | 77 | pub fn bench( 78 | group: &mut BenchGroup, 79 | sample_size: usize, 80 | func_name: S1, 81 | parameter: S2, 82 | mut f: F, 83 | ) where 84 | S1: Into, 85 | S2: Into, 86 | F: FnMut(&mut Bencher), 87 | { 88 | let parameter = parameter.into(); 89 | let func_name = func_name.into(); 90 | // Use old results only for other libraries, not for 'fast_image_resize' 91 | if !func_name.starts_with("fir ") { 92 | if let Some(old_res) = group 93 | .old_results 94 | .iter() 95 | .find(|res| res.function_name == func_name && res.parameter == parameter) 96 | { 97 | group.results.push(old_res.clone()); 98 | println!( 99 | "SKIP benching of '{}' function with '{}' parameter due to using old result.", 100 | func_name, parameter 101 | ); 102 | return; 103 | } 104 | } 105 | 106 | group.results.push(BenchResult { 107 | function_name: func_name.clone(), 108 | parameter: parameter.clone(), 109 | estimate: -1., // Unknown result 110 | }); 111 | 112 | group.criterion_group.sample_size(sample_size); 113 | group.criterion_group.bench_with_input( 114 | BenchmarkId::new(func_name, ¶meter), 115 | ¶meter, 116 | |bencher, _| f(bencher), 117 | ); 118 | } 119 | 120 | /// Pin process to #0 CPU core 121 | pub fn pin_process_to_cpu0() { 122 | #[cfg(not(target_arch = "wasm32"))] 123 | { 124 | let mut cpu_set = nix::sched::CpuSet::new(); 125 | cpu_set.set(0).unwrap(); 126 | nix::sched::sched_setaffinity(nix::unistd::Pid::from_raw(0), &cpu_set).unwrap(); 127 | } 128 | } 129 | 130 | fn criterion_output_directory() -> PathBuf { 131 | if let Some(value) = env::var_os("CRITERION_HOME") { 132 | PathBuf::from(value) 133 | } else if let Some(path) = cargo_target_directory() { 134 | path.join("criterion") 135 | } else { 136 | PathBuf::from("target/criterion") 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /benches/utils/mod.rs: 
-------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | use std::process::Command; 4 | 5 | pub use bencher::*; 6 | pub use resize_functions::*; 7 | pub use results::*; 8 | use serde::Deserialize; 9 | 10 | mod bencher; 11 | mod resize_functions; 12 | mod results; 13 | pub mod testing; 14 | 15 | const fn get_arch_id_and_name() -> (&'static str, &'static str) { 16 | #[cfg(target_arch = "x86_64")] 17 | return ("x86_64", "x86_64"); 18 | #[cfg(target_arch = "aarch64")] 19 | return ("arm64", "arm64"); 20 | #[cfg(target_arch = "wasm32")] 21 | return ("wasm32", "Wasm32"); 22 | #[cfg(not(any( 23 | target_arch = "x86_64", 24 | target_arch = "aarch64", 25 | target_arch = "wasm32" 26 | )))] 27 | return ("unknown", "Unknown"); 28 | } 29 | 30 | /// Returns the Cargo target directory, possibly calling `cargo metadata` to 31 | /// figure it out. 32 | fn cargo_target_directory() -> Option { 33 | #[derive(Deserialize)] 34 | struct Metadata { 35 | target_directory: PathBuf, 36 | } 37 | 38 | env::var_os("CARGO_TARGET_DIR") 39 | .map(PathBuf::from) 40 | .or_else(|| { 41 | let output = Command::new(env::var_os("CARGO")?) 42 | .args(["metadata", "--format-version", "1"]) 43 | .output() 44 | .ok()?; 45 | let metadata: Metadata = serde_json::from_slice(&output.stdout).ok()?; 46 | Some(metadata.target_directory) 47 | }) 48 | } 49 | -------------------------------------------------------------------------------- /benches/utils/testing.rs: -------------------------------------------------------------------------------- 1 | ../../tests/testing.rs -------------------------------------------------------------------------------- /data/crop_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/crop_test.png -------------------------------------------------------------------------------- /data/nasa-4019x4019-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4019x4019-rgba.png -------------------------------------------------------------------------------- /data/nasa-4019x4019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4019x4019.png -------------------------------------------------------------------------------- /data/nasa-4928x3279-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4928x3279-rgba.png -------------------------------------------------------------------------------- /data/nasa-4928x3279.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-4928x3279.png -------------------------------------------------------------------------------- /data/nasa-852x567-rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-852x567-rgba.png 
-------------------------------------------------------------------------------- /data/nasa-852x567.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cykooz/fast_image_resize/a063410fb037d6ae4a5374c04a7a31bb28c71110/data/nasa-852x567.png -------------------------------------------------------------------------------- /dev.md: -------------------------------------------------------------------------------- 1 | # Preparation 2 | 3 | Install system libraries: 4 | 5 | - libvips-dev (used in benchmarks) 6 | 7 | Install additional toolchains: 8 | 9 | - Arm64: 10 | ```shell 11 | rustup target add aarch64-unknown-linux-gnu 12 | ``` 13 | - Wasm32: 14 | ```shell 15 | rustup target add wasm32-wasip2 16 | ``` 17 | Install [Wasmtime](https://wasmtime.dev/). 18 | 19 | # Tests 20 | 21 | Run tests with saving result images as files in `./data` directory: 22 | 23 | ```shell 24 | SAVE_RESULT=1 cargo test 25 | ``` 26 | 27 | # Benchmarks 28 | 29 | Run benchmarks to compare with other crates for image resizing and write results into 30 | report files, such as `./benchmarks-x86_64.md`: 31 | 32 | ```shell 33 | WRITE_COMPARE_RESULT=1 cargo bench -- Compare 34 | ``` 35 | 36 | If you want to use old benchmark results for other crates, you must add 37 | an env variable with the number of days as a result lifetime: 38 | 39 | ```shell 40 | WRITE_COMPARE_RESULT=1 RESULTS_LIFETIME=5 cargo bench -- Compare 41 | ``` 42 | 43 | # Wasm32 44 | 45 | Specify build target and runner in `.cargo/config.toml` file. 46 | 47 | ```toml 48 | [build] 49 | target = "wasm32-wasip2" 50 | 51 | [target.wasm32-wasip2] 52 | runner = "wasmtime --dir=. --" 53 | ``` 54 | 55 | Run tests: 56 | 57 | ```shell 58 | cargo test 59 | ``` 60 | 61 | Run tests with saving result images as files in `./data` directory: 62 | 63 | ```shell 64 | CARGO_TARGET_WASM32_WASIP2_RUNNER="wasmtime --dir=. --env SAVE_RESULT=1 --" cargo test 65 | ``` 66 | 67 | Run a specific benchmark in `quick` mode: 68 | 69 | ```shell 70 | cargo bench --bench bench_resize -- --color=always --quick 71 | ``` 72 | 73 | Run benchmarks to compare with other crates for image resizing and write results into 74 | report files, such as `./benchmarks-wasm32.md`: 75 | 76 | ```shell 77 | CARGO_TARGET_WASM32_WASIP2_RUNNER="wasmtime --dir=. 
--env WRITE_COMPARE_RESULT=1 --" cargo bench --no-fail-fast -- --color=always Compare 78 | ``` 79 | -------------------------------------------------------------------------------- /resizer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "resizer" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | 7 | [dependencies] 8 | fast_image_resize = { path = "..", features = ["image"] } 9 | image = "0.25.2" 10 | clap = { version = "4.5", features = ["derive"] } 11 | log = "0.4.22" 12 | env_logger = "0.11.3" 13 | anyhow = "1.0" 14 | clap-verbosity-flag = "2.2" 15 | once_cell = "1.19" 16 | 17 | 18 | [package.metadata.release] 19 | publish = false 20 | -------------------------------------------------------------------------------- /resizer/src/structs.rs: -------------------------------------------------------------------------------- 1 | use std::num::ParseIntError; 2 | use std::str::FromStr; 3 | 4 | use fast_image_resize as fr; 5 | 6 | #[derive(Copy, Clone, Debug)] 7 | pub enum Size { 8 | Pixels(u32), 9 | Percent(u16), 10 | } 11 | 12 | impl Size { 13 | pub fn calculate_size(&self, src_size: u32) -> u32 { 14 | match *self { 15 | Self::Pixels(size) => size, 16 | Self::Percent(percent) => (src_size as f32 * percent as f32 / 100.).round() as u32, 17 | } 18 | } 19 | } 20 | 21 | impl FromStr for Size { 22 | type Err = ParseIntError; 23 | 24 | fn from_str(s: &str) -> Result { 25 | if let Some(percent_str) = s.strip_suffix('%') { 26 | u16::from_str(percent_str).map(Self::Percent) 27 | } else { 28 | u32::from_str(s).map(Self::Pixels) 29 | } 30 | } 31 | } 32 | 33 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 34 | pub enum Algorithm { 35 | Nearest, 36 | Convolution, 37 | SuperSampling, 38 | } 39 | 40 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 41 | pub enum FilterType { 42 | /// Each pixel of source image contributes to one pixel of the 43 | /// destination image with identical weights. For upscaling is equivalent 44 | /// of `Nearest` resize algorithm. 45 | Box, 46 | /// Bilinear filter calculate the output pixel value using linear 47 | /// interpolation on all pixels that may contribute to the output value. 48 | Bilinear, 49 | /// Hamming filter has the same performance as `Bilinear` filter while 50 | /// providing the image downscaling quality comparable to bicubic 51 | /// (`CatmulRom` or `Mitchell`). Produces a sharper image than `Bilinear`, 52 | /// doesn't have dislocations on local level like with `Box`. 53 | /// The filter don’t show good quality for the image upscaling. 54 | Hamming, 55 | /// Catmull-Rom bicubic filter calculate the output pixel value using 56 | /// cubic interpolation on all pixels that may contribute to the output 57 | /// value. 58 | CatmullRom, 59 | /// Mitchell–Netravali bicubic filter calculate the output pixel value 60 | /// using cubic interpolation on all pixels that may contribute to the 61 | /// output value. 62 | Mitchell, 63 | /// Lanczos3 filter calculate the output pixel value using a high-quality 64 | /// Lanczos filter (a truncated sinc) on all pixels that may contribute 65 | /// to the output value. 
66 | Lanczos3, 67 | } 68 | 69 | impl From for fr::FilterType { 70 | fn from(filter_type: FilterType) -> Self { 71 | match filter_type { 72 | FilterType::Box => fr::FilterType::Box, 73 | FilterType::Bilinear => fr::FilterType::Bilinear, 74 | FilterType::Hamming => fr::FilterType::Hamming, 75 | FilterType::CatmullRom => fr::FilterType::CatmullRom, 76 | FilterType::Mitchell => fr::FilterType::Mitchell, 77 | FilterType::Lanczos3 => fr::FilterType::Lanczos3, 78 | } 79 | } 80 | } 81 | 82 | #[derive(Copy, Clone, Debug, clap::ValueEnum)] 83 | pub enum ColorSpace { 84 | Linear, 85 | /// sRGB for color images or gamma 2.2 for grayscale images 86 | NonLinear, 87 | } 88 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | unstable_features = true 2 | 3 | imports_granularity = "Module" 4 | group_imports = "StdExternalCrate" 5 | -------------------------------------------------------------------------------- /src/alpha/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | use crate::ImageError; 4 | 5 | #[derive(Error, Debug, Clone, Copy)] 6 | #[non_exhaustive] 7 | pub enum MulDivImagesError { 8 | #[error("Source or destination image is not supported")] 9 | ImageError(#[from] ImageError), 10 | #[error("Size of source image does not match to destination image")] 11 | SizeIsDifferent, 12 | #[error("Pixel type of source image does not match to destination image")] 13 | PixelTypesAreDifferent, 14 | } 15 | -------------------------------------------------------------------------------- /src/alpha/f32x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_extensions::CpuExtensions; 2 | use crate::pixels::F32x2; 3 | use crate::{ImageError, ImageView, ImageViewMut}; 4 | 5 | use super::AlphaMulDiv; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | 13 | type P = F32x2; 14 | 15 | impl AlphaMulDiv for P { 16 | fn multiply_alpha( 17 | src_view: &impl ImageView, 18 | dst_view: &mut impl ImageViewMut, 19 | cpu_extensions: CpuExtensions, 20 | ) -> Result<(), ImageError> { 21 | process_two_images! { 22 | multiple(src_view, dst_view, cpu_extensions); 23 | } 24 | Ok(()) 25 | } 26 | 27 | fn multiply_alpha_inplace( 28 | image_view: &mut impl ImageViewMut, 29 | cpu_extensions: CpuExtensions, 30 | ) -> Result<(), ImageError> { 31 | process_one_images! { 32 | multiply_inplace(image_view, cpu_extensions); 33 | } 34 | Ok(()) 35 | } 36 | 37 | fn divide_alpha( 38 | src_view: &impl ImageView, 39 | dst_view: &mut impl ImageViewMut, 40 | cpu_extensions: CpuExtensions, 41 | ) -> Result<(), ImageError> { 42 | process_two_images! { 43 | divide(src_view, dst_view, cpu_extensions); 44 | } 45 | Ok(()) 46 | } 47 | 48 | fn divide_alpha_inplace( 49 | image_view: &mut impl ImageViewMut, 50 | cpu_extensions: CpuExtensions, 51 | ) -> Result<(), ImageError> { 52 | process_one_images! 
{ 53 | divide_inplace(image_view, cpu_extensions); 54 | } 55 | Ok(()) 56 | } 57 | } 58 | 59 | fn multiple( 60 | src_view: &impl ImageView, 61 | dst_view: &mut impl ImageViewMut, 62 | cpu_extensions: CpuExtensions, 63 | ) { 64 | match cpu_extensions { 65 | #[cfg(target_arch = "x86_64")] 66 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 69 | // #[cfg(target_arch = "aarch64")] 70 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 71 | // #[cfg(target_arch = "wasm32")] 72 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 73 | _ => native::multiply_alpha(src_view, dst_view), 74 | } 75 | } 76 | 77 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 78 | match cpu_extensions { 79 | #[cfg(target_arch = "x86_64")] 80 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 83 | // #[cfg(target_arch = "aarch64")] 84 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 85 | // #[cfg(target_arch = "wasm32")] 86 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 87 | _ => native::multiply_alpha_inplace(image_view), 88 | } 89 | } 90 | 91 | fn divide( 92 | src_view: &impl ImageView, 93 | dst_view: &mut impl ImageViewMut, 94 | cpu_extensions: CpuExtensions, 95 | ) { 96 | match cpu_extensions { 97 | #[cfg(target_arch = "x86_64")] 98 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 101 | // #[cfg(target_arch = "aarch64")] 102 | // CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 103 | // #[cfg(target_arch = "wasm32")] 104 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 105 | _ => native::divide_alpha(src_view, dst_view), 106 | } 107 | } 108 | 109 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 110 | match cpu_extensions { 111 | #[cfg(target_arch = "x86_64")] 112 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 115 | // #[cfg(target_arch = "aarch64")] 116 | // CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 117 | // #[cfg(target_arch = "wasm32")] 118 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 119 | _ => native::divide_alpha_inplace(image_view), 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/alpha/f32x2/native.rs: -------------------------------------------------------------------------------- 1 | use num_traits::Zero; 2 | 3 | use crate::pixels::F32x2; 4 | use crate::utils::foreach_with_pre_reading; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | pub(crate) fn multiply_alpha( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | ) { 11 | let src_rows = src_view.iter_rows(0); 12 | let dst_rows = dst_view.iter_rows_mut(0); 13 | 14 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 15 | multiply_alpha_row(src_row, dst_row); 16 | 
} 17 | } 18 | 19 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 20 | for row in image_view.iter_rows_mut(0) { 21 | multiply_alpha_row_inplace(row); 22 | } 23 | } 24 | 25 | #[inline(always)] 26 | pub(crate) fn multiply_alpha_row(src_row: &[F32x2], dst_row: &mut [F32x2]) { 27 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 28 | let components: [f32; 2] = src_pixel.0; 29 | let alpha = components[1]; 30 | dst_pixel.0 = [components[0] * alpha, alpha]; 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [F32x2]) { 36 | for pixel in row { 37 | pixel.0[0] *= pixel.0[1]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for row in image_view.iter_rows_mut(0) { 59 | divide_alpha_row_inplace(row); 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | pub(crate) fn divide_alpha_row(src_row: &[F32x2], dst_row: &mut [F32x2]) { 65 | foreach_with_pre_reading( 66 | src_row.iter().zip(dst_row), 67 | |(&src_pixel, dst_pixel)| (src_pixel, dst_pixel), 68 | |(src_pixel, dst_pixel)| { 69 | let alpha = src_pixel.0[1]; 70 | if alpha.is_zero() { 71 | dst_pixel.0 = [0.; 2]; 72 | } else { 73 | dst_pixel.0 = [src_pixel.0[0] / alpha, alpha]; 74 | } 75 | }, 76 | ); 77 | } 78 | 79 | #[inline(always)] 80 | pub(crate) fn divide_alpha_row_inplace(row: &mut [F32x2]) { 81 | for pixel in row { 82 | let components: [f32; 2] = pixel.0; 83 | let alpha = components[1]; 84 | if alpha.is_zero() { 85 | pixel.0[0] = 0.; 86 | } else { 87 | pixel.0[0] = components[0] / alpha; 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/alpha/f32x4/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cpu_extensions::CpuExtensions; 2 | use crate::pixels::F32x4; 3 | use crate::{ImageError, ImageView, ImageViewMut}; 4 | 5 | use super::AlphaMulDiv; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | 13 | type P = F32x4; 14 | 15 | impl AlphaMulDiv for P { 16 | fn multiply_alpha( 17 | src_view: &impl ImageView, 18 | dst_view: &mut impl ImageViewMut, 19 | cpu_extensions: CpuExtensions, 20 | ) -> Result<(), ImageError> { 21 | process_two_images! { 22 | multiple(src_view, dst_view, cpu_extensions); 23 | } 24 | Ok(()) 25 | } 26 | 27 | fn multiply_alpha_inplace( 28 | image_view: &mut impl ImageViewMut, 29 | cpu_extensions: CpuExtensions, 30 | ) -> Result<(), ImageError> { 31 | process_one_images! { 32 | multiply_inplace(image_view, cpu_extensions); 33 | } 34 | Ok(()) 35 | } 36 | 37 | fn divide_alpha( 38 | src_view: &impl ImageView, 39 | dst_view: &mut impl ImageViewMut, 40 | cpu_extensions: CpuExtensions, 41 | ) -> Result<(), ImageError> { 42 | process_two_images! { 43 | divide(src_view, dst_view, cpu_extensions); 44 | } 45 | Ok(()) 46 | } 47 | 48 | fn divide_alpha_inplace( 49 | image_view: &mut impl ImageViewMut, 50 | cpu_extensions: CpuExtensions, 51 | ) -> Result<(), ImageError> { 52 | process_one_images! 
{ 53 | divide_inplace(image_view, cpu_extensions); 54 | } 55 | Ok(()) 56 | } 57 | } 58 | 59 | fn multiple( 60 | src_view: &impl ImageView, 61 | dst_view: &mut impl ImageViewMut, 62 | cpu_extensions: CpuExtensions, 63 | ) { 64 | match cpu_extensions { 65 | #[cfg(target_arch = "x86_64")] 66 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 69 | // #[cfg(target_arch = "aarch64")] 70 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 71 | // #[cfg(target_arch = "wasm32")] 72 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 73 | _ => native::multiply_alpha(src_view, dst_view), 74 | } 75 | } 76 | 77 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 78 | match cpu_extensions { 79 | #[cfg(target_arch = "x86_64")] 80 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 83 | // #[cfg(target_arch = "aarch64")] 84 | // CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 85 | // #[cfg(target_arch = "wasm32")] 86 | // CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 87 | _ => native::multiply_alpha_inplace(image_view), 88 | } 89 | } 90 | 91 | fn divide( 92 | src_view: &impl ImageView, 93 | dst_view: &mut impl ImageViewMut, 94 | cpu_extensions: CpuExtensions, 95 | ) { 96 | match cpu_extensions { 97 | #[cfg(target_arch = "x86_64")] 98 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 101 | // #[cfg(target_arch = "aarch64")] 102 | // CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 103 | // #[cfg(target_arch = "wasm32")] 104 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 105 | _ => native::divide_alpha(src_view, dst_view), 106 | } 107 | } 108 | 109 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 110 | match cpu_extensions { 111 | #[cfg(target_arch = "x86_64")] 112 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 115 | // #[cfg(target_arch = "aarch64")] 116 | // CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 117 | // #[cfg(target_arch = "wasm32")] 118 | // CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 119 | _ => native::divide_alpha_inplace(image_view), 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/alpha/f32x4/native.rs: -------------------------------------------------------------------------------- 1 | use num_traits::Zero; 2 | 3 | use crate::pixels::F32x4; 4 | use crate::utils::foreach_with_pre_reading; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | pub(crate) fn multiply_alpha( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | ) { 11 | let src_rows = src_view.iter_rows(0); 12 | let dst_rows = dst_view.iter_rows_mut(0); 13 | 14 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 15 | multiply_alpha_row(src_row, dst_row); 16 | 
} 17 | } 18 | 19 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 20 | for row in image_view.iter_rows_mut(0) { 21 | multiply_alpha_row_inplace(row); 22 | } 23 | } 24 | 25 | #[inline(always)] 26 | pub(crate) fn multiply_alpha_row(src_row: &[F32x4], dst_row: &mut [F32x4]) { 27 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 28 | let components = src_pixel.0; 29 | let alpha = components[3]; 30 | dst_pixel.0 = [ 31 | components[0] * alpha, 32 | components[1] * alpha, 33 | components[2] * alpha, 34 | alpha, 35 | ]; 36 | } 37 | } 38 | 39 | #[inline(always)] 40 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [F32x4]) { 41 | for pixel in row { 42 | let alpha = pixel.0[3]; 43 | pixel.0[0] *= alpha; 44 | pixel.0[1] *= alpha; 45 | pixel.0[2] *= alpha; 46 | } 47 | } 48 | 49 | // Divide 50 | 51 | #[inline] 52 | pub(crate) fn divide_alpha( 53 | src_view: &impl ImageView, 54 | dst_view: &mut impl ImageViewMut, 55 | ) { 56 | let src_rows = src_view.iter_rows(0); 57 | let dst_rows = dst_view.iter_rows_mut(0); 58 | 59 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 60 | divide_alpha_row(src_row, dst_row); 61 | } 62 | } 63 | 64 | #[inline] 65 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 66 | for row in image_view.iter_rows_mut(0) { 67 | divide_alpha_row_inplace(row); 68 | } 69 | } 70 | 71 | #[inline(always)] 72 | pub(crate) fn divide_alpha_row(src_row: &[F32x4], dst_row: &mut [F32x4]) { 73 | foreach_with_pre_reading( 74 | src_row.iter().zip(dst_row), 75 | |(&src_pixel, dst_pixel)| (src_pixel, dst_pixel), 76 | |(src_pixel, dst_pixel)| { 77 | let components = src_pixel.0; 78 | let alpha = components[3]; 79 | if alpha.is_zero() { 80 | dst_pixel.0 = [0.; 4]; 81 | } else { 82 | let recip_alpha = 1. / alpha; 83 | dst_pixel.0 = [ 84 | components[0] * recip_alpha, 85 | components[1] * recip_alpha, 86 | components[2] * recip_alpha, 87 | alpha, 88 | ]; 89 | } 90 | }, 91 | ); 92 | } 93 | 94 | #[inline(always)] 95 | pub(crate) fn divide_alpha_row_inplace(row: &mut [F32x4]) { 96 | for pixel in row { 97 | let components = pixel.0; 98 | let alpha = components[3]; 99 | if alpha.is_zero() { 100 | pixel.0 = [0.; 4]; 101 | } else { 102 | let recip_alpha = 1. / alpha; 103 | pixel.0 = [ 104 | components[0] * recip_alpha, 105 | components[1] * recip_alpha, 106 | components[2] * recip_alpha, 107 | alpha, 108 | ]; 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/alpha/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{pixels, CpuExtensions, ImageError, ImageView, ImageViewMut}; 2 | 3 | #[macro_use] 4 | mod common; 5 | pub(crate) mod errors; 6 | 7 | mod u8x4; 8 | cfg_if::cfg_if! { 9 | if #[cfg(not(feature = "only_u8x4"))] { 10 | mod u16x2; 11 | mod u16x4; 12 | mod u8x2; 13 | mod f32x2; 14 | mod f32x4; 15 | } 16 | } 17 | 18 | pub(crate) trait AlphaMulDiv: pixels::InnerPixel { 19 | /// Multiplies RGB-channels of source image by alpha-channel and store 20 | /// result into destination image. 21 | #[allow(unused_variables)] 22 | fn multiply_alpha( 23 | src_view: &impl ImageView, 24 | dst_view: &mut impl ImageViewMut, 25 | cpu_extensions: CpuExtensions, 26 | ) -> Result<(), ImageError> { 27 | Err(ImageError::UnsupportedPixelType) 28 | } 29 | 30 | /// Multiplies RGB-channels of image by alpha-channel inplace. 
31 | #[allow(unused_variables)] 32 | fn multiply_alpha_inplace( 33 | image_view: &mut impl ImageViewMut, 34 | cpu_extensions: CpuExtensions, 35 | ) -> Result<(), ImageError> { 36 | Err(ImageError::UnsupportedPixelType) 37 | } 38 | 39 | /// Divides RGB-channels of source image by alpha-channel and store 40 | /// result into destination image. 41 | #[allow(unused_variables)] 42 | fn divide_alpha( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | cpu_extensions: CpuExtensions, 46 | ) -> Result<(), ImageError> { 47 | Err(ImageError::UnsupportedPixelType) 48 | } 49 | 50 | /// Divides RGB-channels of image by alpha-channel inplace. 51 | #[allow(unused_variables)] 52 | fn divide_alpha_inplace( 53 | image_view: &mut impl ImageViewMut, 54 | cpu_extensions: CpuExtensions, 55 | ) -> Result<(), ImageError> { 56 | Err(ImageError::UnsupportedPixelType) 57 | } 58 | } 59 | 60 | impl AlphaMulDiv for pixels::U8 {} 61 | impl AlphaMulDiv for pixels::U8x3 {} 62 | impl AlphaMulDiv for pixels::U16 {} 63 | impl AlphaMulDiv for pixels::U16x3 {} 64 | impl AlphaMulDiv for pixels::I32 {} 65 | impl AlphaMulDiv for pixels::F32 {} 66 | impl AlphaMulDiv for pixels::F32x3 {} 67 | -------------------------------------------------------------------------------- /src/alpha/u16x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::pixels::U16x2; 2 | use crate::{CpuExtensions, ImageError, ImageView, ImageViewMut}; 3 | 4 | use super::AlphaMulDiv; 5 | 6 | #[cfg(target_arch = "x86_64")] 7 | mod avx2; 8 | mod native; 9 | #[cfg(target_arch = "aarch64")] 10 | mod neon; 11 | #[cfg(target_arch = "x86_64")] 12 | mod sse4; 13 | #[cfg(target_arch = "wasm32")] 14 | mod wasm32; 15 | 16 | type P = U16x2; 17 | 18 | impl AlphaMulDiv for P { 19 | fn multiply_alpha( 20 | src_view: &impl ImageView, 21 | dst_view: &mut impl ImageViewMut, 22 | cpu_extensions: CpuExtensions, 23 | ) -> Result<(), ImageError> { 24 | process_two_images! { 25 | multiple(src_view, dst_view, cpu_extensions); 26 | } 27 | Ok(()) 28 | } 29 | 30 | fn multiply_alpha_inplace( 31 | image_view: &mut impl ImageViewMut, 32 | cpu_extensions: CpuExtensions, 33 | ) -> Result<(), ImageError> { 34 | process_one_images! { 35 | multiply_inplace(image_view, cpu_extensions); 36 | } 37 | Ok(()) 38 | } 39 | 40 | fn divide_alpha( 41 | src_view: &impl ImageView, 42 | dst_view: &mut impl ImageViewMut, 43 | cpu_extensions: CpuExtensions, 44 | ) -> Result<(), ImageError> { 45 | process_two_images! { 46 | divide(src_view, dst_view, cpu_extensions); 47 | } 48 | Ok(()) 49 | } 50 | 51 | fn divide_alpha_inplace( 52 | image_view: &mut impl ImageViewMut, 53 | cpu_extensions: CpuExtensions, 54 | ) -> Result<(), ImageError> { 55 | process_one_images! 
{ 56 | divide_inplace(image_view, cpu_extensions); 57 | } 58 | Ok(()) 59 | } 60 | } 61 | 62 | fn multiple( 63 | src_view: &impl ImageView, 64 | dst_view: &mut impl ImageViewMut, 65 | cpu_extensions: CpuExtensions, 66 | ) { 67 | match cpu_extensions { 68 | #[cfg(target_arch = "x86_64")] 69 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 70 | #[cfg(target_arch = "x86_64")] 71 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 72 | #[cfg(target_arch = "aarch64")] 73 | CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 74 | #[cfg(target_arch = "wasm32")] 75 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 76 | _ => native::multiply_alpha(src_view, dst_view), 77 | } 78 | } 79 | 80 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 81 | match cpu_extensions { 82 | #[cfg(target_arch = "x86_64")] 83 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 84 | #[cfg(target_arch = "x86_64")] 85 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 86 | #[cfg(target_arch = "aarch64")] 87 | CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 88 | #[cfg(target_arch = "wasm32")] 89 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 90 | _ => native::multiply_alpha_inplace(image_view), 91 | } 92 | } 93 | 94 | fn divide( 95 | src_view: &impl ImageView, 96 | dst_view: &mut impl ImageViewMut, 97 | cpu_extensions: CpuExtensions, 98 | ) { 99 | match cpu_extensions { 100 | #[cfg(target_arch = "x86_64")] 101 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 102 | #[cfg(target_arch = "x86_64")] 103 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 104 | #[cfg(target_arch = "aarch64")] 105 | CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 106 | #[cfg(target_arch = "wasm32")] 107 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 108 | _ => native::divide_alpha(src_view, dst_view), 109 | } 110 | } 111 | 112 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 113 | match cpu_extensions { 114 | #[cfg(target_arch = "x86_64")] 115 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 116 | #[cfg(target_arch = "x86_64")] 117 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 118 | #[cfg(target_arch = "aarch64")] 119 | CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 120 | #[cfg(target_arch = "wasm32")] 121 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 122 | _ => native::divide_alpha_inplace(image_view), 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/alpha/u16x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip16, mul_div_65535, RECIP_ALPHA16}; 2 | use crate::pixels::U16x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn 
multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u16; 2] = src_pixel.0; 27 | let alpha = components[1]; 28 | dst_pixel.0 = [mul_div_65535(components[0], alpha), alpha]; 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U16x2]) { 34 | for pixel in row { 35 | let components: [u16; 2] = pixel.0; 36 | let alpha = components[1]; 37 | pixel.0 = [mul_div_65535(components[0], alpha), alpha]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for row in image_view.iter_rows_mut(0) { 59 | divide_alpha_row_inplace(row); 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | pub(crate) fn divide_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { 65 | src_row 66 | .iter() 67 | .zip(dst_row) 68 | .for_each(|(src_pixel, dst_pixel)| { 69 | let components: [u16; 2] = src_pixel.0; 70 | let alpha = components[1]; 71 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 72 | dst_pixel.0 = [div_and_clip16(components[0], recip_alpha), alpha]; 73 | }); 74 | } 75 | 76 | #[inline(always)] 77 | pub(crate) fn divide_alpha_row_inplace(row: &mut [U16x2]) { 78 | for pixel in row { 79 | let components: [u16; 2] = pixel.0; 80 | let alpha = components[1]; 81 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 82 | pixel.0 = [div_and_clip16(components[0], recip_alpha), alpha]; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/alpha/u16x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip16, mul_div_65535, RECIP_ALPHA16}; 2 | use crate::pixels::U16x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u16; 4] = src_pixel.0; 27 | let alpha = components[3]; 28 | dst_pixel.0 = [ 29 | mul_div_65535(components[0], alpha), 30 | mul_div_65535(components[1], alpha), 31 | mul_div_65535(components[2], alpha), 32 | alpha, 33 | ]; 34 | } 35 | } 36 | 37 | #[inline(always)] 38 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U16x4]) { 39 | for pixel in row { 40 | let components: [u16; 4] = pixel.0; 41 | let alpha = components[3]; 42 | pixel.0 = [ 43 | 
mul_div_65535(components[0], alpha), 44 | mul_div_65535(components[1], alpha), 45 | mul_div_65535(components[2], alpha), 46 | alpha, 47 | ]; 48 | } 49 | } 50 | 51 | // Divide 52 | 53 | #[inline] 54 | pub(crate) fn divide_alpha( 55 | src_view: &impl ImageView, 56 | dst_view: &mut impl ImageViewMut, 57 | ) { 58 | let src_rows = src_view.iter_rows(0); 59 | let dst_rows = dst_view.iter_rows_mut(0); 60 | 61 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 62 | divide_alpha_row(src_row, dst_row); 63 | } 64 | } 65 | 66 | #[inline] 67 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 68 | for row in image_view.iter_rows_mut(0) { 69 | divide_alpha_row_inplace(row); 70 | } 71 | } 72 | 73 | #[inline(always)] 74 | pub(crate) fn divide_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { 75 | src_row 76 | .iter() 77 | .zip(dst_row) 78 | .for_each(|(src_pixel, dst_pixel)| { 79 | let components: [u16; 4] = src_pixel.0; 80 | let alpha = components[3]; 81 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 82 | dst_pixel.0 = [ 83 | div_and_clip16(components[0], recip_alpha), 84 | div_and_clip16(components[1], recip_alpha), 85 | div_and_clip16(components[2], recip_alpha), 86 | alpha, 87 | ]; 88 | }); 89 | } 90 | 91 | #[inline(always)] 92 | pub(crate) fn divide_alpha_row_inplace(row: &mut [U16x4]) { 93 | row.iter_mut().for_each(|pixel| { 94 | let components: [u16; 4] = pixel.0; 95 | let alpha = components[3]; 96 | let recip_alpha = RECIP_ALPHA16[alpha as usize]; 97 | pixel.0 = [ 98 | div_and_clip16(components[0], recip_alpha), 99 | div_and_clip16(components[1], recip_alpha), 100 | div_and_clip16(components[2], recip_alpha), 101 | alpha, 102 | ]; 103 | }); 104 | } 105 | -------------------------------------------------------------------------------- /src/alpha/u8x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip, mul_div_255, RECIP_ALPHA}; 2 | use crate::pixels::U8x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | 12 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 13 | multiply_alpha_row(src_row, dst_row); 14 | } 15 | } 16 | 17 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 18 | for row in image_view.iter_rows_mut(0) { 19 | multiply_alpha_row_inplace(row); 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub(crate) fn multiply_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { 25 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 26 | let components: [u8; 2] = src_pixel.0; 27 | let alpha = components[1]; 28 | dst_pixel.0 = [mul_div_255(components[0], alpha), alpha]; 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U8x2]) { 34 | for pixel in row { 35 | let components: [u8; 2] = pixel.0; 36 | let alpha = components[1]; 37 | pixel.0 = [mul_div_255(components[0], alpha), alpha]; 38 | } 39 | } 40 | 41 | // Divide 42 | 43 | #[inline] 44 | pub(crate) fn divide_alpha( 45 | src_view: &impl ImageView, 46 | dst_view: &mut impl ImageViewMut, 47 | ) { 48 | let src_rows = src_view.iter_rows(0); 49 | let dst_rows = dst_view.iter_rows_mut(0); 50 | 51 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 52 | divide_alpha_row(src_row, dst_row); 53 | } 54 | } 55 | 56 | #[inline] 57 | pub(crate) fn 
divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 58 | for dst_row in image_view.iter_rows_mut(0) { 59 | let src_row = unsafe { std::slice::from_raw_parts(dst_row.as_ptr(), dst_row.len()) }; 60 | divide_alpha_row(src_row, dst_row); 61 | } 62 | } 63 | 64 | #[inline(always)] 65 | pub(crate) fn divide_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { 66 | src_row 67 | .iter() 68 | .zip(dst_row) 69 | .for_each(|(src_pixel, dst_pixel)| { 70 | let components: [u8; 2] = src_pixel.0; 71 | let alpha = components[1]; 72 | let recip_alpha = RECIP_ALPHA[alpha as usize]; 73 | dst_pixel.0 = [div_and_clip(components[0], recip_alpha), alpha]; 74 | }); 75 | } 76 | -------------------------------------------------------------------------------- /src/alpha/u8x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::AlphaMulDiv; 2 | use crate::pixels::U8x4; 3 | use crate::{CpuExtensions, ImageError, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | mod avx2; 7 | mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | mod wasm32; 14 | 15 | type P = U8x4; 16 | 17 | impl AlphaMulDiv for P { 18 | fn multiply_alpha( 19 | src_view: &impl ImageView, 20 | dst_view: &mut impl ImageViewMut, 21 | cpu_extensions: CpuExtensions, 22 | ) -> Result<(), ImageError> { 23 | process_two_images! { 24 | multiple(src_view, dst_view, cpu_extensions); 25 | } 26 | Ok(()) 27 | } 28 | 29 | fn multiply_alpha_inplace( 30 | image_view: &mut impl ImageViewMut, 31 | cpu_extensions: CpuExtensions, 32 | ) -> Result<(), ImageError> { 33 | process_one_images! { 34 | multiply_inplace(image_view, cpu_extensions); 35 | } 36 | Ok(()) 37 | } 38 | 39 | fn divide_alpha( 40 | src_view: &impl ImageView, 41 | dst_view: &mut impl ImageViewMut, 42 | cpu_extensions: CpuExtensions, 43 | ) -> Result<(), ImageError> { 44 | process_two_images! { 45 | divide(src_view, dst_view, cpu_extensions); 46 | } 47 | Ok(()) 48 | } 49 | 50 | fn divide_alpha_inplace( 51 | image_view: &mut impl ImageViewMut, 52 | cpu_extensions: CpuExtensions, 53 | ) -> Result<(), ImageError> { 54 | process_one_images! 
{ 55 | divide_inplace(image_view, cpu_extensions); 56 | } 57 | Ok(()) 58 | } 59 | } 60 | 61 | fn multiple( 62 | src_view: &impl ImageView, 63 | dst_view: &mut impl ImageViewMut, 64 | cpu_extensions: CpuExtensions, 65 | ) { 66 | match cpu_extensions { 67 | #[cfg(target_arch = "x86_64")] 68 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha(src_view, dst_view) }, 69 | #[cfg(target_arch = "x86_64")] 70 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_view, dst_view) }, 71 | #[cfg(target_arch = "aarch64")] 72 | CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_view, dst_view) }, 73 | #[cfg(target_arch = "wasm32")] 74 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha(src_view, dst_view) }, 75 | _ => native::multiply_alpha(src_view, dst_view), 76 | } 77 | } 78 | 79 | fn multiply_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 80 | match cpu_extensions { 81 | #[cfg(target_arch = "x86_64")] 82 | CpuExtensions::Avx2 => unsafe { avx2::multiply_alpha_inplace(image_view) }, 83 | #[cfg(target_arch = "x86_64")] 84 | CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image_view) }, 85 | #[cfg(target_arch = "aarch64")] 86 | CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image_view) }, 87 | #[cfg(target_arch = "wasm32")] 88 | CpuExtensions::Simd128 => unsafe { wasm32::multiply_alpha_inplace(image_view) }, 89 | _ => native::multiply_alpha_inplace(image_view), 90 | } 91 | } 92 | 93 | fn divide( 94 | src_view: &impl ImageView, 95 | dst_view: &mut impl ImageViewMut, 96 | cpu_extensions: CpuExtensions, 97 | ) { 98 | match cpu_extensions { 99 | #[cfg(target_arch = "x86_64")] 100 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha(src_view, dst_view) }, 101 | #[cfg(target_arch = "x86_64")] 102 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_view, dst_view) }, 103 | #[cfg(target_arch = "aarch64")] 104 | CpuExtensions::Neon => unsafe { neon::divide_alpha(src_view, dst_view) }, 105 | #[cfg(target_arch = "wasm32")] 106 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha(src_view, dst_view) }, 107 | _ => native::divide_alpha(src_view, dst_view), 108 | } 109 | } 110 | 111 | fn divide_inplace(image_view: &mut impl ImageViewMut, cpu_extensions: CpuExtensions) { 112 | match cpu_extensions { 113 | #[cfg(target_arch = "x86_64")] 114 | CpuExtensions::Avx2 => unsafe { avx2::divide_alpha_inplace(image_view) }, 115 | #[cfg(target_arch = "x86_64")] 116 | CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image_view) }, 117 | #[cfg(target_arch = "aarch64")] 118 | CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image_view) }, 119 | #[cfg(target_arch = "wasm32")] 120 | CpuExtensions::Simd128 => unsafe { wasm32::divide_alpha_inplace(image_view) }, 121 | _ => native::divide_alpha_inplace(image_view), 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/alpha/u8x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::alpha::common::{div_and_clip, mul_div_255, RECIP_ALPHA}; 2 | use crate::pixels::U8x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn multiply_alpha( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | ) { 9 | let src_rows = src_view.iter_rows(0); 10 | let dst_rows = dst_view.iter_rows_mut(0); 11 | let rows = src_rows.zip(dst_rows); 12 | 13 | for (src_row, dst_row) in rows { 14 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row.iter_mut()) { 15 | 
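// Illustrative sketch (an assumption, not code from this crate): `mul_div_255`, used by
// `multiply_alpha_pixel` below, computes an integer approximation of value * alpha / 255.
// One common exact-rounding formulation looks like this; the crate's actual
// implementation lives in `src/alpha/common.rs` and may differ in detail.
#[allow(dead_code)]
fn mul_div_255_sketch(value: u8, alpha: u8) -> u8 {
    // round(value * alpha / 255) without a division: add 128, then fold the high byte in.
    let tmp = value as u16 * alpha as u16 + 128;
    ((tmp + (tmp >> 8)) >> 8) as u8
}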
*dst_pixel = multiply_alpha_pixel(*src_pixel); 16 | } 17 | } 18 | } 19 | 20 | pub(crate) fn multiply_alpha_inplace(image_view: &mut impl ImageViewMut) { 21 | let rows = image_view.iter_rows_mut(0); 22 | for row in rows { 23 | multiply_alpha_row_inplace(row); 24 | } 25 | } 26 | 27 | #[inline(always)] 28 | pub(crate) fn multiply_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { 29 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 30 | *dst_pixel = multiply_alpha_pixel(*src_pixel); 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub(crate) fn multiply_alpha_row_inplace(row: &mut [U8x4]) { 36 | for pixel in row.iter_mut() { 37 | *pixel = multiply_alpha_pixel(*pixel); 38 | } 39 | } 40 | 41 | #[inline(always)] 42 | fn multiply_alpha_pixel(mut pixel: U8x4) -> U8x4 { 43 | let alpha = pixel.0[3]; 44 | pixel.0 = [ 45 | mul_div_255(pixel.0[0], alpha), 46 | mul_div_255(pixel.0[1], alpha), 47 | mul_div_255(pixel.0[2], alpha), 48 | alpha, 49 | ]; 50 | pixel 51 | } 52 | 53 | // Divide 54 | 55 | #[inline] 56 | pub(crate) fn divide_alpha( 57 | src_view: &impl ImageView, 58 | dst_view: &mut impl ImageViewMut, 59 | ) { 60 | let src_rows = src_view.iter_rows(0); 61 | let dst_rows = dst_view.iter_rows_mut(0); 62 | let rows = src_rows.zip(dst_rows); 63 | for (src_row, dst_row) in rows { 64 | divide_alpha_row(src_row, dst_row); 65 | } 66 | } 67 | 68 | #[inline] 69 | pub(crate) fn divide_alpha_inplace(image_view: &mut impl ImageViewMut) { 70 | let rows = image_view.iter_rows_mut(0); 71 | for row in rows { 72 | row.iter_mut().for_each(|pixel| { 73 | *pixel = divide_alpha_pixel(*pixel); 74 | }); 75 | } 76 | } 77 | 78 | #[inline(always)] 79 | pub(crate) fn divide_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { 80 | for (src_pixel, dst_pixel) in src_row.iter().zip(dst_row) { 81 | *dst_pixel = divide_alpha_pixel(*src_pixel); 82 | } 83 | } 84 | 85 | #[inline(always)] 86 | fn divide_alpha_pixel(mut pixel: U8x4) -> U8x4 { 87 | let alpha = pixel.0[3]; 88 | let recip_alpha = RECIP_ALPHA[alpha as usize]; 89 | pixel.0 = [ 90 | div_and_clip(pixel.0[0], recip_alpha), 91 | div_and_clip(pixel.0[1], recip_alpha), 92 | div_and_clip(pixel.0[2], recip_alpha), 93 | alpha, 94 | ]; 95 | pixel 96 | } 97 | -------------------------------------------------------------------------------- /src/color/mappers.rs: -------------------------------------------------------------------------------- 1 | use crate::PixelComponentMapper; 2 | 3 | fn gamma_into_linear(input: f32) -> f32 { 4 | input.powf(2.2) 5 | } 6 | 7 | fn linear_into_gamma(input: f32) -> f32 { 8 | input.powf(1.0 / 2.2) 9 | } 10 | 11 | /// Create mapper to convert an image from Gamma 2.2 to linear colorspace and back. 12 | pub fn create_gamma_22_mapper() -> PixelComponentMapper { 13 | PixelComponentMapper::new(gamma_into_linear, linear_into_gamma) 14 | } 15 | 16 | /// https://en.wikipedia.org/wiki/SRGB#From_sRGB_to_CIE_XYZ 17 | /// http://www.ericbrasseur.org/gamma.html?i=2#formulas 18 | fn srgb_to_linear(input: f32) -> f32 { 19 | if input < 0.04045 { 20 | input / 12.92 21 | } else { 22 | const A: f32 = 0.055; 23 | ((input + A) / (1. + A)).powf(2.4) 24 | } 25 | } 26 | 27 | /// https://en.wikipedia.org/wiki/SRGB#From_CIE_XYZ_to_sRGB 28 | /// http://www.ericbrasseur.org/gamma.html?i=2#formulas 29 | fn linear_to_srgb(input: f32) -> f32 { 30 | if input < 0.0031308 { 31 | 12.92 * input 32 | } else { 33 | const A: f32 = 0.055; 34 | (1. + A) * input.powf(1. / 2.4) - A 35 | } 36 | } 37 | 38 | /// Create mapper to convert an image from sRGB to linear RGB colorspace and back. 
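// Worked example (added for illustration, not in the original source): the two branch
// thresholds above are consistent, since 0.0031308 * 12.92 ≈ 0.04045, so the linear and
// power segments meet at the same point in both directions. For a mid-grey sRGB value of
// 0.5, srgb_to_linear(0.5) = ((0.5 + 0.055) / 1.055)^2.4 ≈ 0.214, and feeding that value
// back through linear_to_srgb returns ≈ 0.5.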
39 | pub fn create_srgb_mapper() -> PixelComponentMapper { 40 | PixelComponentMapper::new(srgb_to_linear, linear_to_srgb) 41 | } 42 | -------------------------------------------------------------------------------- /src/convolution/f32x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::vertical_f32::vert_convolution_f32; 3 | use crate::cpu_extensions::CpuExtensions; 4 | use crate::pixels::F32; 5 | use crate::{ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | // #[cfg(target_arch = "aarch64")] 11 | // mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | // #[cfg(target_arch = "wasm32")] 15 | // mod wasm32; 16 | 17 | type P = F32; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | let coeffs_ref = &coeffs; 29 | 30 | try_process_in_threads_h! { 31 | horiz_convolution( 32 | src_view, 33 | dst_view, 34 | offset, 35 | coeffs_ref, 36 | cpu_extensions, 37 | ); 38 | } 39 | } 40 | 41 | fn vert_convolution( 42 | src_view: &impl ImageView, 43 | dst_view: &mut impl ImageViewMut, 44 | offset: u32, 45 | coeffs: Coefficients, 46 | cpu_extensions: CpuExtensions, 47 | ) { 48 | debug_assert!(src_view.width() - offset >= dst_view.width()); 49 | 50 | let coeffs_ref = &coeffs; 51 | 52 | try_process_in_threads_v! { 53 | vert_convolution( 54 | src_view, 55 | dst_view, 56 | offset, 57 | coeffs_ref, 58 | cpu_extensions, 59 | ); 60 | } 61 | } 62 | } 63 | 64 | fn horiz_convolution( 65 | src_view: &impl ImageView, 66 | dst_view: &mut impl ImageViewMut, 67 | offset: u32, 68 | coeffs: &Coefficients, 69 | cpu_extensions: CpuExtensions, 70 | ) { 71 | match cpu_extensions { 72 | #[cfg(target_arch = "x86_64")] 73 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 74 | #[cfg(target_arch = "x86_64")] 75 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 76 | // #[cfg(target_arch = "aarch64")] 77 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 78 | // #[cfg(target_arch = "wasm32")] 79 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 80 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | } 82 | } 83 | 84 | fn vert_convolution( 85 | src_view: &impl ImageView, 86 | dst_view: &mut impl ImageViewMut, 87 | offset: u32, 88 | coeffs: &Coefficients, 89 | cpu_extensions: CpuExtensions, 90 | ) { 91 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 92 | } 93 | -------------------------------------------------------------------------------- /src/convolution/f32x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 
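// Illustrative sketch (not part of the original file): each output pixel below is a plain
// 1-D weighted sum, dst[x] = sum_k coeffs_chunk.values[k] * src[coeffs_chunk.start + k],
// accumulated in f64 before narrowing to f32. A minimal scalar version for a single
// output sample, with hypothetical `src`/`weights` slices, could look like this:
#[allow(dead_code)]
fn convolve_one_sample(src: &[f32], start: usize, weights: &[f64]) -> f32 {
    let mut sum = 0.0_f64;
    for (k, &w) in weights.iter().enumerate() {
        // Widen each source sample to f64 to match the precision of the real kernel.
        sum += src[start + k] as f64 * w;
    }
    sum as f32
}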
15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let end_x_src = first_x_src + coeffs_chunk.values.len(); 18 | let mut ss = 0.; 19 | let mut src_pixels = unsafe { src_row.get_unchecked(first_x_src..end_x_src) }; 20 | let mut coefs = coeffs_chunk.values; 21 | 22 | (coefs, src_pixels) = convolution_by_chunks::<8>(coefs, src_pixels, &mut ss); 23 | 24 | for (&k, &pixel) in coefs.iter().zip(src_pixels) { 25 | ss += pixel.0 as f64 * k; 26 | } 27 | dst_pixel.0 = ss as f32; 28 | } 29 | } 30 | } 31 | 32 | #[inline(always)] 33 | fn convolution_by_chunks<'a, 'b, const CHUNK_SIZE: usize>( 34 | coefs: &'a [f64], 35 | src_pixels: &'b [F32], 36 | ss: &mut f64, 37 | ) -> (&'a [f64], &'b [F32]) { 38 | let coef_chunks = coefs.chunks_exact(CHUNK_SIZE); 39 | let coefs = coef_chunks.remainder(); 40 | let pixel_chunks = src_pixels.chunks_exact(CHUNK_SIZE); 41 | let src_pixels = pixel_chunks.remainder(); 42 | for (ks, pixels) in coef_chunks.zip(pixel_chunks) { 43 | for (&k, &pixel) in ks.iter().zip(pixels) { 44 | *ss += pixel.0 as f64 * k; 45 | } 46 | } 47 | (coefs, src_pixels) 48 | } 49 | -------------------------------------------------------------------------------- /src/convolution/f32x1/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32]; ROWS_COUNT], 44 | dst_rows: [&mut [F32]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut sums = [_mm_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let coeffs_by_4 = coeffs.chunks_exact(4); 56 | coeffs = coeffs_by_4.remainder(); 57 | 58 | for k in coeffs_by_4 { 59 | let coeff01_f64x2 = simd_utils::loadu_pd(k, 0); 60 | let coeff23_f64x2 = simd_utils::loadu_pd(k, 2); 61 | 62 | for i in 0..ROWS_COUNT { 
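// Note (added for illustration): `loadu_ps` reads four consecutive f32 samples,
// `_mm_cvtps_pd` widens the low pair to f64, and `_mm_movehl_ps` moves the high pair
// into the low lanes so it can be widened the same way; each row therefore accumulates
// two f64 partial sums per register, which are added together at the store step below.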
63 | let mut sum = sums[i]; 64 | let source = simd_utils::loadu_ps(src_rows[i], x); 65 | 66 | let pixel01_f64 = _mm_cvtps_pd(source); 67 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel01_f64, coeff01_f64x2)); 68 | 69 | let pixel23_f64 = _mm_cvtps_pd(_mm_movehl_ps(source, source)); 70 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel23_f64, coeff23_f64x2)); 71 | 72 | sums[i] = sum; 73 | } 74 | x += 4; 75 | } 76 | 77 | let coeffs_by_2 = coeffs.chunks_exact(2); 78 | coeffs = coeffs_by_2.remainder(); 79 | for k in coeffs_by_2 { 80 | let coeff01_f64x2 = simd_utils::loadu_pd(k, 0); 81 | 82 | for i in 0..ROWS_COUNT { 83 | let pixel0 = src_rows[i].get_unchecked(x).0; 84 | let pixel1 = src_rows[i].get_unchecked(x + 1).0; 85 | let pixel01_f64 = _mm_set_pd(pixel1 as f64, pixel0 as f64); 86 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(pixel01_f64, coeff01_f64x2)); 87 | } 88 | x += 2; 89 | } 90 | 91 | if let Some(&k) = coeffs.first() { 92 | let coeff0_f64x2 = _mm_set1_pd(k); 93 | 94 | for i in 0..ROWS_COUNT { 95 | let pixel0 = src_rows[i].get_unchecked(x).0; 96 | let pixel0_f64 = _mm_set_pd(0., pixel0 as f64); 97 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 98 | } 99 | } 100 | 101 | for i in 0..ROWS_COUNT { 102 | _mm_storeu_pd(ll_buf.as_mut_ptr(), sums[i]); 103 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 104 | dst_pixel.0 = (ll_buf[0] + ll_buf[1]) as f32; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/convolution/f32x2/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x2; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "avx2")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x2]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x2]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut ll_sum = [_mm256_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let 
coeffs_by_4 = coeffs.chunks_exact(4); 56 | coeffs = coeffs_by_4.remainder(); 57 | for k in coeffs_by_4 { 58 | let coeff0_f64x4 = _mm256_set_pd(k[1], k[1], k[0], k[0]); 59 | let coeff1_f64x4 = _mm256_set_pd(k[3], k[3], k[2], k[2]); 60 | 61 | for i in 0..ROWS_COUNT { 62 | let mut sum = ll_sum[i]; 63 | let pixels04_f32x8 = simd_utils::loadu_ps256(src_rows[i], x); 64 | 65 | let pixels01_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(pixels04_f32x8)); 66 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels01_f64x4, coeff0_f64x4)); 67 | 68 | let pixels23_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(pixels04_f32x8)); 69 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels23_f64x4, coeff1_f64x4)); 70 | 71 | ll_sum[i] = sum; 72 | } 73 | x += 4; 74 | } 75 | 76 | let coeffs_by_2 = coeffs.chunks_exact(2); 77 | coeffs = coeffs_by_2.remainder(); 78 | for k in coeffs_by_2 { 79 | let coeff_f64x4 = _mm256_set_pd(k[1], k[1], k[0], k[0]); 80 | 81 | for i in 0..ROWS_COUNT { 82 | let mut sum = ll_sum[i]; 83 | let pixels01_f32x4 = simd_utils::loadu_ps(src_rows[i], x); 84 | 85 | let pixels01_f64x4 = _mm256_cvtps_pd(pixels01_f32x4); 86 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixels01_f64x4, coeff_f64x4)); 87 | 88 | ll_sum[i] = sum; 89 | } 90 | x += 2; 91 | } 92 | 93 | if let Some(&k) = coeffs.first() { 94 | let coeff0_f64x4 = _mm256_set1_pd(k); 95 | 96 | for i in 0..ROWS_COUNT { 97 | let mut sum = ll_sum[i]; 98 | let pixel = src_rows[i].get_unchecked(x); 99 | 100 | let pixel0_f64x4 = _mm256_set_pd(0., 0., pixel.0[1] as f64, pixel.0[0] as f64); 101 | sum = _mm256_add_pd(sum, _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 102 | 103 | ll_sum[i] = sum; 104 | } 105 | } 106 | 107 | for i in 0..ROWS_COUNT { 108 | let sum_f64x2 = _mm_add_pd( 109 | _mm256_extractf128_pd::<0>(ll_sum[i]), 110 | _mm256_extractf128_pd::<1>(ll_sum[i]), 111 | ); 112 | _mm_storeu_pd(ll_buf.as_mut_ptr(), sum_f64x2); 113 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 114 | dst_pixel.0 = ll_buf.map(|v| v as f32); 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/convolution/f32x2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x2; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x2; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! 
{ 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 2]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x2/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x2; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x2]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x2]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut ll_buf = [0f64; 2]; 48 | 49 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 50 | let mut x: usize = coeffs_chunk.start as usize; 51 | let mut ll_sum = [_mm_set1_pd(0.); ROWS_COUNT]; 52 | 53 | let mut coeffs = coeffs_chunk.values; 54 | 55 | let coeffs_by_2 = coeffs.chunks_exact(2); 56 | coeffs = coeffs_by_2.remainder(); 57 | 58 | for k in coeffs_by_2 { 59 | let coeff0_f64x2 = _mm_set1_pd(k[0]); 60 | let coeff1_f64x2 = _mm_set1_pd(k[1]); 61 | 62 | for i in 0..ROWS_COUNT { 63 | let mut sum = ll_sum[i]; 64 | let source = simd_utils::loadu_ps(src_rows[i], x); 65 | 66 | let pixel0_f64 = _mm_cvtps_pd(source); 67 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 68 | 69 | let pixel1_f64 = _mm_cvtps_pd(_mm_movehl_ps(source, source)); 70 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel1_f64, coeff1_f64x2)); 71 | 72 | ll_sum[i] = sum; 73 | } 74 | x += 2; 75 | } 76 | 77 | if let Some(&k) = coeffs.first() { 78 | let coeff0_f64x2 = _mm_set1_pd(k); 79 | 80 | for i in 0..ROWS_COUNT { 81 | let mut sum = ll_sum[i]; 82 | let pixel = src_rows[i].get_unchecked(x); 83 | let source = _mm_set_ps(0., 0., pixel.0[1], pixel.0[0]); 84 | 85 | let pixel0_f64 = _mm_cvtps_pd(source); 86 | sum = _mm_add_pd(sum, _mm_mul_pd(pixel0_f64, coeff0_f64x2)); 87 | 88 | ll_sum[i] = sum; 89 | } 90 | } 91 | 92 | for i in 0..ROWS_COUNT { 93 | _mm_storeu_pd(ll_buf.as_mut_ptr(), ll_sum[i]); 94 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 
95 | dst_pixel.0 = ll_buf.map(|v| v as f32); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/convolution/f32x3/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x3; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x3; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! { 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in 
dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 3]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x4/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x4; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "avx2")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x4]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x4]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 48 | let mut x: usize = coeffs_chunk.start as usize; 49 | let mut rgba_sums = [_mm256_set1_pd(0.); ROWS_COUNT]; 50 | 51 | let mut coeffs = coeffs_chunk.values; 52 | 53 | let coeffs_by_2 = coeffs.chunks_exact(2); 54 | coeffs = coeffs_by_2.remainder(); 55 | for k in coeffs_by_2 { 56 | let coeff0_f64x4 = _mm256_set1_pd(k[0]); 57 | let coeff1_f64x4 = _mm256_set1_pd(k[1]); 58 | 59 | for r in 0..ROWS_COUNT { 60 | let pixel01 = simd_utils::loadu_ps256(src_rows[r], x); 61 | 62 | let pixel0_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(pixel01)); 63 | rgba_sums[r] = 64 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 65 | 66 | let pixels1_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(pixel01)); 67 | rgba_sums[r] = 68 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixels1_f64x4, coeff1_f64x4)); 69 | } 70 | x += 2; 71 | } 72 | 73 | if let Some(&k) = coeffs.first() { 74 | let coeff0_f64x4 = _mm256_set1_pd(k); 75 | 76 | for r in 0..ROWS_COUNT { 77 | let pixel0 = simd_utils::loadu_ps(src_rows[r], x); 78 | 79 | let pixel0_f64x4 = _mm256_cvtps_pd(pixel0); 80 | rgba_sums[r] = 81 | _mm256_add_pd(rgba_sums[r], _mm256_mul_pd(pixel0_f64x4, coeff0_f64x4)); 82 | } 83 | } 
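// Note (added for illustration, not in the original source): each `rgba_sums[r]` register
// holds the four f64 channel accumulators (R, G, B, A) of one output pixel for row `r`;
// the loop below narrows them back to f32 and stores the pixel. A scalar equivalent of
// the accumulation, with hypothetical `src`/`weights` slices, would be roughly:
#[allow(dead_code)]
fn accumulate_rgba_scalar(src: &[[f32; 4]], start: usize, weights: &[f64]) -> [f32; 4] {
    let mut acc = [0.0_f64; 4];
    for (k, &w) in weights.iter().enumerate() {
        for (a, &c) in acc.iter_mut().zip(&src[start + k]) {
            // Accumulate every channel of the source pixel with the same weight.
            *a += c as f64 * w;
        }
    }
    acc.map(|v| v as f32)
}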
84 | 85 | for r in 0..ROWS_COUNT { 86 | let dst_pixel = dst_rows[r].get_unchecked_mut(dst_x); 87 | let rgba_f32x4 = _mm256_cvtpd_ps(rgba_sums[r]); 88 | _mm_storeu_ps(dst_pixel.0.as_mut_ptr(), rgba_f32x4); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/convolution/f32x4/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::vertical_f32::vert_convolution_f32; 2 | use crate::cpu_extensions::CpuExtensions; 3 | use crate::pixels::F32x4; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | use super::{Coefficients, Convolution}; 7 | 8 | #[cfg(target_arch = "x86_64")] 9 | mod avx2; 10 | mod native; 11 | // #[cfg(target_arch = "aarch64")] 12 | // mod neon; 13 | #[cfg(target_arch = "x86_64")] 14 | mod sse4; 15 | // #[cfg(target_arch = "wasm32")] 16 | // mod wasm32; 17 | 18 | type P = F32x4; 19 | 20 | impl Convolution for P { 21 | fn horiz_convolution( 22 | src_view: &impl ImageView, 23 | dst_view: &mut impl ImageViewMut, 24 | offset: u32, 25 | coeffs: Coefficients, 26 | cpu_extensions: CpuExtensions, 27 | ) { 28 | debug_assert!(src_view.height() - offset >= dst_view.height()); 29 | let coeffs_ref = &coeffs; 30 | 31 | try_process_in_threads_h! { 32 | horiz_convolution( 33 | src_view, 34 | dst_view, 35 | offset, 36 | coeffs_ref, 37 | cpu_extensions, 38 | ); 39 | } 40 | } 41 | 42 | fn vert_convolution( 43 | src_view: &impl ImageView, 44 | dst_view: &mut impl ImageViewMut, 45 | offset: u32, 46 | coeffs: Coefficients, 47 | cpu_extensions: CpuExtensions, 48 | ) { 49 | debug_assert!(src_view.width() - offset >= dst_view.width()); 50 | 51 | let coeffs_ref = &coeffs; 52 | 53 | try_process_in_threads_v! { 54 | vert_convolution( 55 | src_view, 56 | dst_view, 57 | offset, 58 | coeffs_ref, 59 | cpu_extensions, 60 | ); 61 | } 62 | } 63 | } 64 | 65 | fn horiz_convolution( 66 | src_view: &impl ImageView, 67 | dst_view: &mut impl ImageViewMut, 68 | offset: u32, 69 | coeffs: &Coefficients, 70 | cpu_extensions: CpuExtensions, 71 | ) { 72 | match cpu_extensions { 73 | #[cfg(target_arch = "x86_64")] 74 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, coeffs), 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, coeffs), 77 | // #[cfg(target_arch = "aarch64")] 78 | // CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, coeffs), 79 | // #[cfg(target_arch = "wasm32")] 80 | // CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, coeffs), 81 | _ => native::horiz_convolution(src_view, dst_view, offset, coeffs), 82 | } 83 | } 84 | 85 | fn vert_convolution( 86 | src_view: &impl ImageView, 87 | dst_view: &mut impl ImageViewMut, 88 | offset: u32, 89 | coeffs: &Coefficients, 90 | cpu_extensions: CpuExtensions, 91 | ) { 92 | vert_convolution_f32(src_view, dst_view, offset, coeffs, cpu_extensions); 93 | } 94 | -------------------------------------------------------------------------------- /src/convolution/f32x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::F32x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = 
src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = [0.; 4]; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 19 | for (&k, &src_pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 21 | *s += c as f64 * k; 22 | } 23 | } 24 | dst_pixel.0 = ss.map(|v| v as f32); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/convolution/f32x4/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::F32x4; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | #[inline] 8 | pub(crate) fn horiz_convolution( 9 | src_view: &impl ImageView, 10 | dst_view: &mut impl ImageViewMut, 11 | offset: u32, 12 | coeffs: &Coefficients, 13 | ) { 14 | let coefficients_chunks = coeffs.get_chunks(); 15 | let dst_height = dst_view.height(); 16 | 17 | let src_iter = src_view.iter_4_rows(offset, dst_height + offset); 18 | let dst_iter = dst_view.iter_4_rows_mut(); 19 | for (src_rows, dst_rows) in src_iter.zip(dst_iter) { 20 | unsafe { 21 | horiz_convolution_rows(src_rows, dst_rows, &coefficients_chunks); 22 | } 23 | } 24 | 25 | let yy = dst_height - dst_height % 4; 26 | let src_rows = src_view.iter_rows(yy + offset); 27 | let dst_rows = dst_view.iter_rows_mut(yy); 28 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 29 | unsafe { 30 | horiz_convolution_rows([src_row], [dst_row], &coefficients_chunks); 31 | } 32 | } 33 | } 34 | 35 | /// For safety, it is necessary to ensure the following conditions: 36 | /// - length of all rows in src_rows must be equal 37 | /// - length of all rows in dst_rows must be equal 38 | /// - coefficients_chunks.len() == dst_rows.0.len() 39 | /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() 40 | /// - precision <= MAX_COEFS_PRECISION 41 | #[target_feature(enable = "sse4.1")] 42 | unsafe fn horiz_convolution_rows( 43 | src_rows: [&[F32x4]; ROWS_COUNT], 44 | dst_rows: [&mut [F32x4]; ROWS_COUNT], 45 | coefficients_chunks: &[CoefficientsChunk], 46 | ) { 47 | let mut rg_buf = [0f64; 2]; 48 | let mut ba_buf = [0f64; 2]; 49 | 50 | for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { 51 | let mut x: usize = coeffs_chunk.start as usize; 52 | let mut rg_sums = [_mm_set1_pd(0.); ROWS_COUNT]; 53 | let mut ba_sums = [_mm_set1_pd(0.); ROWS_COUNT]; 54 | 55 | for &k in coeffs_chunk.values { 56 | let coeffs_f64x2 = _mm_set1_pd(k); 57 | 58 | for r in 0..ROWS_COUNT { 59 | let pixel = simd_utils::loadu_ps(src_rows[r], x); 60 | let rg_f64x2 = _mm_cvtps_pd(pixel); 61 | rg_sums[r] = _mm_add_pd(rg_sums[r], _mm_mul_pd(rg_f64x2, coeffs_f64x2)); 62 | let ba_f64x2 = _mm_cvtps_pd(_mm_movehl_ps(pixel, pixel)); 63 | ba_sums[r] = _mm_add_pd(ba_sums[r], _mm_mul_pd(ba_f64x2, coeffs_f64x2)); 64 | } 65 | x += 1; 66 | } 67 | 68 | for i in 0..ROWS_COUNT { 69 | _mm_storeu_pd(rg_buf.as_mut_ptr(), rg_sums[i]); 70 | _mm_storeu_pd(ba_buf.as_mut_ptr(), ba_sums[i]); 71 | let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); 72 | dst_pixel.0 = [ 73 | rg_buf[0] as f32, 74 | rg_buf[1] as f32, 75 | ba_buf[0] as f32, 76 | ba_buf[1] as f32, 77 | ]; 78 | } 79 | } 
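// Note (added for illustration): unlike the AVX2 version, an SSE register holds only two
// f64 lanes, so each row keeps two accumulators per pixel: `rg_sums` for the R and G
// channels and `ba_sums` for B and A, which were written out separately just above.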
80 | } 81 | -------------------------------------------------------------------------------- /src/convolution/i32x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::pixels::I32; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | mod native; 6 | 7 | type P = I32; 8 | 9 | impl Convolution for P { 10 | fn horiz_convolution( 11 | src_view: &impl ImageView, 12 | dst_view: &mut impl ImageViewMut, 13 | offset: u32, 14 | coeffs: Coefficients, 15 | _cpu_extensions: CpuExtensions, 16 | ) { 17 | debug_assert!(src_view.height() - offset >= dst_view.height()); 18 | let coeffs_ref = &coeffs; 19 | 20 | try_process_in_threads_h! { 21 | horiz_convolution( 22 | src_view, 23 | dst_view, 24 | offset, 25 | coeffs_ref, 26 | ); 27 | } 28 | } 29 | 30 | fn vert_convolution( 31 | src_view: &impl ImageView, 32 | dst_view: &mut impl ImageViewMut, 33 | offset: u32, 34 | coeffs: Coefficients, 35 | _cpu_extensions: CpuExtensions, 36 | ) { 37 | debug_assert!(src_view.width() - offset >= dst_view.width()); 38 | 39 | let coeffs_ref = &coeffs; 40 | 41 | try_process_in_threads_v! { 42 | vert_convolution( 43 | src_view, 44 | dst_view, 45 | offset, 46 | coeffs_ref, 47 | ); 48 | } 49 | } 50 | } 51 | 52 | #[inline(always)] 53 | fn horiz_convolution( 54 | src_view: &impl ImageView, 55 | dst_view: &mut impl ImageViewMut, 56 | offset: u32, 57 | coefficients: &Coefficients, 58 | ) { 59 | native::horiz_convolution(src_view, dst_view, offset, coefficients); 60 | } 61 | 62 | #[inline(always)] 63 | fn vert_convolution( 64 | src_view: &impl ImageView, 65 | dst_view: &mut impl ImageViewMut, 66 | offset: u32, 67 | coefficients: &Coefficients, 68 | ) { 69 | native::vert_convolution(src_view, dst_view, offset, coefficients); 70 | } 71 | -------------------------------------------------------------------------------- /src/convolution/i32x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::I32; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | coeffs: &Coefficients, 10 | ) { 11 | let coefficients_chunks = coeffs.get_chunks(); 12 | let src_rows = src_view.iter_rows(offset); 13 | let dst_rows = dst_view.iter_rows_mut(0); 14 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 15 | for (dst_pixel, coeffs_chunk) in dst_row.iter_mut().zip(&coefficients_chunks) { 16 | let first_x_src = coeffs_chunk.start as usize; 17 | let mut ss = 0.; 18 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
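// SAFETY: the coefficient chunks are assumed to be computed so that
// `start + values.len()` never exceeds the source row length, therefore this
// unchecked slice (and the zipped iteration over it below) stays in bounds.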
}; 19 | for (&k, &pixel) in coeffs_chunk.values.iter().zip(src_pixels) { 20 | ss += pixel.0 as f64 * k; 21 | } 22 | dst_pixel.0 = ss.round() as i32; 23 | } 24 | } 25 | } 26 | 27 | pub(crate) fn vert_convolution( 28 | src_view: &impl ImageView, 29 | dst_view: &mut impl ImageViewMut, 30 | offset: u32, 31 | coeffs: &Coefficients, 32 | ) { 33 | let coefficients_chunks = coeffs.get_chunks(); 34 | let dst_rows = dst_view.iter_rows_mut(0); 35 | let start_src_x = offset as usize; 36 | for (&coeffs_chunk, dst_row) in coefficients_chunks.iter().zip(dst_rows) { 37 | let first_y_src = coeffs_chunk.start; 38 | let mut src_x = start_src_x; 39 | for dst_pixel in dst_row.iter_mut() { 40 | let mut ss = 0.; 41 | let src_rows = src_view.iter_rows(first_y_src); 42 | for (src_row, &k) in src_rows.zip(coeffs_chunk.values) { 43 | let src_pixel = unsafe { src_row.get_unchecked(src_x) }; 44 | ss += src_pixel.0 as f64 * k; 45 | } 46 | dst_pixel.0 = ss.round() as i32; 47 | src_x += 1; 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/convolution/macros.rs: -------------------------------------------------------------------------------- 1 | macro_rules! constify_imm8 { 2 | ($imm8:expr, $expand:ident) => { 3 | #[allow(overflowing_literals)] 4 | match ($imm8) & 0b0011_1111 { 5 | 0 => {} 6 | 1 => $expand!(1), 7 | 2 => $expand!(2), 8 | 3 => $expand!(3), 9 | 4 => $expand!(4), 10 | 5 => $expand!(5), 11 | 6 => $expand!(6), 12 | 7 => $expand!(7), 13 | 8 => $expand!(8), 14 | 9 => $expand!(9), 15 | 10 => $expand!(10), 16 | 12 => $expand!(12), 17 | 13 => $expand!(13), 18 | 14 => $expand!(14), 19 | 15 => $expand!(15), 20 | 16 => $expand!(16), 21 | 17 => $expand!(17), 22 | 18 => $expand!(18), 23 | 19 => $expand!(19), 24 | 20 => $expand!(20), 25 | 21 => $expand!(21), 26 | 22 => $expand!(22), 27 | 23 => $expand!(23), 28 | 24 => $expand!(24), 29 | 25 => $expand!(25), 30 | 26 => $expand!(26), 31 | 27 => $expand!(27), 32 | 28 => $expand!(28), 33 | 29 => $expand!(29), 34 | 30 => $expand!(30), 35 | 31 => $expand!(31), 36 | _ => unreachable!(), 37 | } 38 | }; 39 | } 40 | 41 | #[cfg(target_arch = "aarch64")] 42 | macro_rules! 
constify_64_imm8 { 43 | ($imm8:expr, $expand:ident) => { 44 | #[allow(overflowing_literals)] 45 | match ($imm8) & 0b0111_1111 { 46 | 0 => {} 47 | 1 => $expand!(1), 48 | 2 => $expand!(2), 49 | 3 => $expand!(3), 50 | 4 => $expand!(4), 51 | 5 => $expand!(5), 52 | 6 => $expand!(6), 53 | 7 => $expand!(7), 54 | 8 => $expand!(8), 55 | 9 => $expand!(9), 56 | 10 => $expand!(10), 57 | 12 => $expand!(12), 58 | 13 => $expand!(13), 59 | 14 => $expand!(14), 60 | 15 => $expand!(15), 61 | 16 => $expand!(16), 62 | 17 => $expand!(17), 63 | 18 => $expand!(18), 64 | 19 => $expand!(19), 65 | 20 => $expand!(20), 66 | 21 => $expand!(21), 67 | 22 => $expand!(22), 68 | 23 => $expand!(23), 69 | 24 => $expand!(24), 70 | 25 => $expand!(25), 71 | 26 => $expand!(26), 72 | 27 => $expand!(27), 73 | 28 => $expand!(28), 74 | 29 => $expand!(29), 75 | 30 => $expand!(30), 76 | 31 => $expand!(31), 77 | 32 => $expand!(32), 78 | 33 => $expand!(33), 79 | 34 => $expand!(34), 80 | 35 => $expand!(35), 81 | 36 => $expand!(36), 82 | 37 => $expand!(37), 83 | 38 => $expand!(38), 84 | 39 => $expand!(39), 85 | 40 => $expand!(40), 86 | 41 => $expand!(41), 87 | 42 => $expand!(42), 88 | 43 => $expand!(43), 89 | 44 => $expand!(44), 90 | 45 => $expand!(45), 91 | 46 => $expand!(46), 92 | 47 => $expand!(47), 93 | 48 => $expand!(48), 94 | 49 => $expand!(49), 95 | 50 => $expand!(50), 96 | 51 => $expand!(51), 97 | 52 => $expand!(52), 98 | 53 => $expand!(53), 99 | 54 => $expand!(54), 100 | 55 => $expand!(55), 101 | 56 => $expand!(56), 102 | 57 => $expand!(57), 103 | 58 => $expand!(58), 104 | 59 => $expand!(59), 105 | 60 => $expand!(60), 106 | 61 => $expand!(61), 107 | 62 => $expand!(62), 108 | 63 => $expand!(63), 109 | _ => unreachable!(), 110 | } 111 | }; 112 | } 113 | -------------------------------------------------------------------------------- /src/convolution/u16x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
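// NOTE: this macro is defined in src/threading.rs; when the crate is built with
// the optional `rayon` feature it is expected to run the vertical convolution on
// the rayon thread pool, otherwise it should expand to a plain call on the
// current thread.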
{ 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial = 1i64 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = initial; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | ss += src_pixel.0 as i64 * (k as i64); 25 | } 26 | dst_pixel.0 = normalizer.clip(ss); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/convolution/u16x2/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x2; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x2; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial: i64 = 1 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 2]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u16x3/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x3; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x3; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial = 1i64 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 3]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u16x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer32; 3 | use crate::convolution::vertical_u16::vert_convolution_u16; 4 | use crate::pixels::U16x4; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U16x4; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer32::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer32::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u16( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer32, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u16x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::U16x4; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer32, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients_chunks = normalizer.chunks(); 14 | let initial: i64 = 1 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 4]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
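// The sums above are kept in i64 rather than i32: a u16 component multiplied by
// a 32-bit fixed-point coefficient from `Normalizer32` can exceed the i32 range,
// and several such products are added per output pixel. `normalizer.clip(ss)`
// then turns the wide sum back into a u16, presumably by dividing out the
// fixed-point precision and clamping.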
}; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i64 * (k as i64); 26 | } 27 | } 28 | for (i, s) in ss.iter().copied().enumerate() { 29 | dst_pixel.0[i] = normalizer.clip(s); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/convolution/u8x1/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x1/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let initial = 1 << (precision - 1); 14 | let coefficients = normalizer.chunks(); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = initial; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, &src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | ss += src_pixel.0 as i32 * (k as i32); 25 | } 26 | dst_pixel.0 = unsafe { normalizer.clip(ss) }; 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/convolution/u8x2/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x2; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x2; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x2/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8x2; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | pub(crate) fn horiz_convolution( 6 | src_view: &impl ImageView, 7 | dst_view: &mut impl ImageViewMut, 8 | offset: u32, 9 | normalizer: &Normalizer16, 10 | ) { 11 | let precision = normalizer.precision(); 12 | let coefficients_chunks = normalizer.chunks(); 13 | let initial = 1 << (precision - 1); 14 | 15 | let src_rows = src_view.iter_rows(offset); 16 | let dst_rows = dst_view.iter_rows_mut(0); 17 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 18 | for (coeffs_chunk, dst_pixel) in coefficients_chunks.iter().zip(dst_row.iter_mut()) { 19 | let first_x_src = coeffs_chunk.start as usize; 20 | let ks = coeffs_chunk.values(); 21 | let mut ss = [initial; 2]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
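// NOTE on the fixed-point scheme used here: `Normalizer16` appears to store the
// f64 convolution coefficients as i16 values with `precision` fractional bits,
// so each product fits in an i32. `initial = 1 << (precision - 1)` is the
// rounding half that `normalizer.clip` accounts for when it scales the sum back
// down and clamps it to the 0..=255 range of a u8 component.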
}; 23 | for (&k, &src_pixel) in ks.iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/u8x3/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x3; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x3; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! { 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! 
{ 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x3/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::U8x3; 3 | use crate::{ImageView, ImageViewMut}; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let coefficients = normalizer.chunks(); 14 | let initial = 1i32 << (precision - 1); 15 | 16 | let src_rows = src_view.iter_rows(offset); 17 | let dst_rows = dst_view.iter_rows_mut(0); 18 | for (dst_row, src_row) in dst_rows.zip(src_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 3]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) }; 23 | for (&k, src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (s, c) in ss.iter_mut().zip(src_pixel.0) { 25 | *s += c as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/u8x4/mod.rs: -------------------------------------------------------------------------------- 1 | use super::{Coefficients, Convolution}; 2 | use crate::convolution::optimisations::Normalizer16; 3 | use crate::convolution::vertical_u8::vert_convolution_u8; 4 | use crate::pixels::U8x4; 5 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 6 | 7 | #[cfg(target_arch = "x86_64")] 8 | mod avx2; 9 | mod native; 10 | #[cfg(target_arch = "aarch64")] 11 | mod neon; 12 | #[cfg(target_arch = "x86_64")] 13 | mod sse4; 14 | #[cfg(target_arch = "wasm32")] 15 | mod wasm32; 16 | 17 | type P = U8x4; 18 | 19 | impl Convolution for P { 20 | fn horiz_convolution( 21 | src_view: &impl ImageView, 22 | dst_view: &mut impl ImageViewMut, 23 | offset: u32, 24 | coeffs: Coefficients, 25 | cpu_extensions: CpuExtensions, 26 | ) { 27 | debug_assert!(src_view.height() - offset >= dst_view.height()); 28 | 29 | let normalizer = Normalizer16::new(coeffs); 30 | let normalizer_ref = &normalizer; 31 | 32 | try_process_in_threads_h! 
{ 33 | horiz_convolution( 34 | src_view, 35 | dst_view, 36 | offset, 37 | normalizer_ref, 38 | cpu_extensions, 39 | ); 40 | } 41 | } 42 | 43 | fn vert_convolution( 44 | src_view: &impl ImageView, 45 | dst_view: &mut impl ImageViewMut, 46 | offset: u32, 47 | coeffs: Coefficients, 48 | cpu_extensions: CpuExtensions, 49 | ) { 50 | debug_assert!(src_view.width() - offset >= dst_view.width()); 51 | 52 | let normalizer = Normalizer16::new(coeffs); 53 | let normalizer_ref = &normalizer; 54 | 55 | try_process_in_threads_v! { 56 | vert_convolution_u8( 57 | src_view, 58 | dst_view, 59 | offset, 60 | normalizer_ref, 61 | cpu_extensions, 62 | ); 63 | } 64 | } 65 | } 66 | 67 | fn horiz_convolution( 68 | src_view: &impl ImageView, 69 | dst_view: &mut impl ImageViewMut, 70 | offset: u32, 71 | normalizer: &Normalizer16, 72 | cpu_extensions: CpuExtensions, 73 | ) { 74 | match cpu_extensions { 75 | #[cfg(target_arch = "x86_64")] 76 | CpuExtensions::Avx2 => avx2::horiz_convolution(src_view, dst_view, offset, normalizer), 77 | #[cfg(target_arch = "x86_64")] 78 | CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_view, dst_view, offset, normalizer), 79 | #[cfg(target_arch = "aarch64")] 80 | CpuExtensions::Neon => neon::horiz_convolution(src_view, dst_view, offset, normalizer), 81 | #[cfg(target_arch = "wasm32")] 82 | CpuExtensions::Simd128 => wasm32::horiz_convolution(src_view, dst_view, offset, normalizer), 83 | _ => native::horiz_convolution(src_view, dst_view, offset, normalizer), 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/convolution/u8x4/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::image_view::{ImageView, ImageViewMut}; 3 | use crate::pixels::U8x4; 4 | 5 | #[inline(always)] 6 | pub(crate) fn horiz_convolution( 7 | src_view: &impl ImageView, 8 | dst_view: &mut impl ImageViewMut, 9 | offset: u32, 10 | normalizer: &Normalizer16, 11 | ) { 12 | let precision = normalizer.precision(); 13 | let initial = 1 << (precision - 1); 14 | let coefficients = normalizer.chunks(); 15 | let src_rows = src_view.iter_rows(offset); 16 | let dst_rows = dst_view.iter_rows_mut(0); 17 | 18 | for (src_row, dst_row) in src_rows.zip(dst_rows) { 19 | for (coeffs_chunk, dst_pixel) in coefficients.iter().zip(dst_row.iter_mut()) { 20 | let first_x_src = coeffs_chunk.start as usize; 21 | let mut ss = [initial; 4]; 22 | let src_pixels = unsafe { src_row.get_unchecked(first_x_src..) 
}; 23 | for (&k, &src_pixel) in coeffs_chunk.values().iter().zip(src_pixels) { 24 | for (i, s) in ss.iter_mut().enumerate() { 25 | *s += src_pixel.0[i] as i32 * (k as i32); 26 | } 27 | } 28 | dst_pixel.0 = ss.map(|v| unsafe { normalizer.clip(v) }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/avx2.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::InnerPixel; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | use super::native; 8 | 9 | pub(crate) fn vert_convolution( 10 | src_view: &impl ImageView, 11 | dst_view: &mut impl ImageViewMut, 12 | offset: u32, 13 | coeffs: &Coefficients, 14 | ) where 15 | T: InnerPixel, 16 | { 17 | let coefficients_chunks = coeffs.get_chunks(); 18 | let src_x = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { 22 | unsafe { 23 | vert_convolution_into_one_row_f32(src_view, dst_row, src_x, coeffs_chunk); 24 | } 25 | } 26 | } 27 | 28 | #[target_feature(enable = "avx2")] 29 | unsafe fn vert_convolution_into_one_row_f32>( 30 | src_view: &impl ImageView, 31 | dst_row: &mut [T], 32 | mut src_x: usize, 33 | coeffs_chunk: CoefficientsChunk, 34 | ) { 35 | let mut c_buf = [0f64; 4]; 36 | let mut dst_f32 = T::components_mut(dst_row); 37 | 38 | let mut dst_chunks = dst_f32.chunks_exact_mut(32); 39 | for dst_chunk in &mut dst_chunks { 40 | multiply_components_of_rows::<_, 8>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 41 | src_x += 32; 42 | } 43 | 44 | dst_f32 = dst_chunks.into_remainder(); 45 | dst_chunks = dst_f32.chunks_exact_mut(16); 46 | for dst_chunk in &mut dst_chunks { 47 | multiply_components_of_rows::<_, 4>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 48 | src_x += 16; 49 | } 50 | 51 | dst_f32 = dst_chunks.into_remainder(); 52 | dst_chunks = dst_f32.chunks_exact_mut(8); 53 | for dst_chunk in &mut dst_chunks { 54 | multiply_components_of_rows::<_, 2>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 55 | src_x += 8; 56 | } 57 | 58 | dst_f32 = dst_chunks.into_remainder(); 59 | if !dst_f32.is_empty() { 60 | let y_start = coeffs_chunk.start; 61 | let coeffs = coeffs_chunk.values; 62 | native::convolution_by_f32(src_view, dst_f32, src_x, y_start, coeffs); 63 | } 64 | } 65 | 66 | #[inline] 67 | #[target_feature(enable = "avx2")] 68 | unsafe fn multiply_components_of_rows, const SUMS_COUNT: usize>( 69 | src_view: &impl ImageView, 70 | src_x: usize, 71 | coeffs_chunk: CoefficientsChunk, 72 | dst_chunk: &mut [f32], 73 | c_buf: &mut [f64; 4], 74 | ) { 75 | let mut sums = [_mm256_set1_pd(0.); SUMS_COUNT]; 76 | let y_start = coeffs_chunk.start; 77 | let mut coeffs = coeffs_chunk.values; 78 | let mut y: u32 = 0; 79 | let max_rows = coeffs.len() as u32; 80 | 81 | let coeffs_2 = coeffs.chunks_exact(2); 82 | coeffs = coeffs_2.remainder(); 83 | for (src_rows, two_coeffs) in src_view.iter_2_rows(y_start, max_rows).zip(coeffs_2) { 84 | let src_rows = src_rows.map(|row| T::components(row).get_unchecked(src_x..)); 85 | for (&coeff, src_row) in two_coeffs.iter().zip(src_rows) { 86 | multiply_components_of_row(&mut sums, coeff, src_row); 87 | } 88 | y += 2; 89 | } 90 | 91 | if let Some(&coeff) = coeffs.first() { 92 | if let Some(s_row) = src_view.iter_rows(y_start + y).next() { 93 | let src_row 
= T::components(s_row).get_unchecked(src_x..); 94 | multiply_components_of_row(&mut sums, coeff, src_row); 95 | } 96 | } 97 | 98 | let mut dst_ptr = dst_chunk.as_mut_ptr(); 99 | for sum in sums { 100 | _mm256_storeu_pd(c_buf.as_mut_ptr(), sum); 101 | for &v in c_buf.iter() { 102 | *dst_ptr = v as f32; 103 | dst_ptr = dst_ptr.add(1); 104 | } 105 | } 106 | } 107 | 108 | #[inline] 109 | #[target_feature(enable = "avx2")] 110 | unsafe fn multiply_components_of_row( 111 | sums: &mut [__m256d; SUMS_COUNT], 112 | coeff: f64, 113 | src_row: &[f32], 114 | ) { 115 | let coeff_f64x4 = _mm256_set1_pd(coeff); 116 | let mut i = 0; 117 | while i < SUMS_COUNT { 118 | let comp07_f32x8 = simd_utils::loadu_ps256(src_row, i * 4); 119 | 120 | let comp03_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<0>(comp07_f32x8)); 121 | sums[i] = _mm256_add_pd(sums[i], _mm256_mul_pd(comp03_f64x4, coeff_f64x4)); 122 | i += 1; 123 | 124 | let comp47_f64x4 = _mm256_cvtps_pd(_mm256_extractf128_ps::<1>(comp07_f32x8)); 125 | sums[i] = _mm256_add_pd(sums[i], _mm256_mul_pd(comp47_f64x4, coeff_f64x4)); 126 | i += 1; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | // #[cfg(target_arch = "aarch64")] 9 | // mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | // #[cfg(target_arch = "wasm32")] 13 | // pub mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_f32>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | coeffs: &Coefficients, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(coeffs.bounds.len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, coeffs), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, coeffs), 31 | // #[cfg(target_arch = "aarch64")] 32 | // CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, coeffs), 33 | // #[cfg(target_arch = "wasm32")] 34 | // CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, coeffs), 35 | _ => native::vert_convolution(src_view, dst_view, offset, coeffs), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::Coefficients; 2 | use crate::pixels::InnerPixel; 3 | use crate::utils::foreach_with_pre_reading; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | #[inline(always)] 7 | pub(crate) fn vert_convolution( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | offset: u32, 11 | coeffs: &Coefficients, 12 | ) where 13 | T: InnerPixel, 14 | { 15 | let coefficients_chunks = coeffs.get_chunks(); 16 | let src_x_initial = offset as usize * T::count_of_components(); 17 | 18 | let dst_rows = dst_view.iter_rows_mut(0); 19 | let coeffs_chunks_iter = 
coefficients_chunks.into_iter(); 20 | for (coeffs_chunk, dst_row) in coeffs_chunks_iter.zip(dst_rows) { 21 | let first_y_src = coeffs_chunk.start; 22 | let ks = coeffs_chunk.values; 23 | let mut dst_components = T::components_mut(dst_row); 24 | let mut x_src = src_x_initial; 25 | 26 | #[cfg(target_arch = "aarch64")] 27 | { 28 | (dst_components, x_src) = 29 | convolution_by_chunks::<_, 16>(src_view, dst_components, x_src, first_y_src, ks); 30 | } 31 | 32 | #[cfg(not(target_arch = "wasm32"))] 33 | { 34 | if !dst_components.is_empty() { 35 | (dst_components, x_src) = 36 | convolution_by_chunks::<_, 8>(src_view, dst_components, x_src, first_y_src, ks); 37 | } 38 | } 39 | 40 | #[cfg(target_arch = "wasm32")] 41 | { 42 | if !dst_components.is_empty() { 43 | (dst_components, x_src) = 44 | crate::convolution::vertical_f32::native::convolution_by_chunks::<_, 4>( 45 | src_view, 46 | dst_components, 47 | x_src, 48 | first_y_src, 49 | ks, 50 | ); 51 | } 52 | } 53 | 54 | if !dst_components.is_empty() { 55 | convolution_by_f32(src_view, dst_components, x_src, first_y_src, ks); 56 | } 57 | } 58 | } 59 | 60 | #[inline(always)] 61 | pub(crate) fn convolution_by_f32>( 62 | src_view: &impl ImageView, 63 | dst_components: &mut [f32], 64 | mut x_src: usize, 65 | first_y_src: u32, 66 | ks: &[f64], 67 | ) -> usize { 68 | for dst_component in dst_components.iter_mut() { 69 | let mut ss = 0.; 70 | let src_rows = src_view.iter_rows(first_y_src); 71 | for (&k, src_row) in ks.iter().zip(src_rows) { 72 | // SAFETY: Alignment of src_row is greater or equal than alignment f32 73 | // because a component of pixel type T is f32. 74 | let src_ptr = src_row.as_ptr() as *const f32; 75 | let src_component = unsafe { *src_ptr.add(x_src) }; 76 | ss += src_component as f64 * k; 77 | } 78 | *dst_component = ss as f32; 79 | x_src += 1 80 | } 81 | x_src 82 | } 83 | 84 | #[inline(always)] 85 | fn convolution_by_chunks<'a, T, const CHUNK_SIZE: usize>( 86 | src_view: &impl ImageView, 87 | dst_components: &'a mut [f32], 88 | mut x_src: usize, 89 | first_y_src: u32, 90 | ks: &[f64], 91 | ) -> (&'a mut [f32], usize) 92 | where 93 | T: InnerPixel, 94 | { 95 | let mut dst_chunks = dst_components.chunks_exact_mut(CHUNK_SIZE); 96 | 97 | for dst_chunk in &mut dst_chunks { 98 | let mut ss = [0.; CHUNK_SIZE]; 99 | let src_rows = src_view.iter_rows(first_y_src); 100 | 101 | foreach_with_pre_reading( 102 | ks.iter().zip(src_rows), 103 | |(&k, src_row)| { 104 | let src_ptr = src_row.as_ptr() as *const f32; 105 | let src_chunk = unsafe { 106 | let ptr = src_ptr.add(x_src) as *const [f32; CHUNK_SIZE]; 107 | ptr.read_unaligned() 108 | }; 109 | (src_chunk, k) 110 | }, 111 | |(src_chunk, k)| { 112 | for (s, c) in ss.iter_mut().zip(src_chunk) { 113 | *s += c as f64 * k; 114 | } 115 | }, 116 | ); 117 | 118 | for (i, s) in ss.iter().copied().enumerate() { 119 | dst_chunk[i] = s as f32; 120 | } 121 | x_src += CHUNK_SIZE; 122 | } 123 | 124 | (dst_chunks.into_remainder(), x_src) 125 | } 126 | -------------------------------------------------------------------------------- /src/convolution/vertical_f32/sse4.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | 3 | use crate::convolution::{Coefficients, CoefficientsChunk}; 4 | use crate::pixels::InnerPixel; 5 | use crate::{simd_utils, ImageView, ImageViewMut}; 6 | 7 | use super::native; 8 | 9 | pub(crate) fn vert_convolution( 10 | src_view: &impl ImageView, 11 | dst_view: &mut impl ImageViewMut, 12 | offset: u32, 13 | coeffs: &Coefficients, 14 | ) 
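// Each destination row below is produced from one coefficients chunk. The
// per-row routine walks the row's f32 components in chunks of 16, then 8,
// then a single chunk of 4, accumulating in f64 pairs (__m128d), and leaves
// any remaining components to the scalar native::convolution_by_f32 fallback.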
where 15 | T: InnerPixel, 16 | { 17 | let coefficients_chunks = coeffs.get_chunks(); 18 | let src_x = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { 22 | unsafe { 23 | vert_convolution_into_one_row_f32(src_view, dst_row, src_x, coeffs_chunk); 24 | } 25 | } 26 | } 27 | 28 | #[target_feature(enable = "sse4.1")] 29 | unsafe fn vert_convolution_into_one_row_f32>( 30 | src_view: &impl ImageView, 31 | dst_row: &mut [T], 32 | mut src_x: usize, 33 | coeffs_chunk: CoefficientsChunk, 34 | ) { 35 | let mut c_buf = [0f64; 2]; 36 | let mut dst_f32 = T::components_mut(dst_row); 37 | 38 | let mut dst_chunks = dst_f32.chunks_exact_mut(16); 39 | for dst_chunk in &mut dst_chunks { 40 | multiply_components_of_rows::<_, 8>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 41 | src_x += 16; 42 | } 43 | 44 | dst_f32 = dst_chunks.into_remainder(); 45 | dst_chunks = dst_f32.chunks_exact_mut(8); 46 | for dst_chunk in &mut dst_chunks { 47 | multiply_components_of_rows::<_, 4>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 48 | src_x += 8; 49 | } 50 | 51 | dst_f32 = dst_chunks.into_remainder(); 52 | dst_chunks = dst_f32.chunks_exact_mut(4); 53 | if let Some(dst_chunk) = dst_chunks.next() { 54 | multiply_components_of_rows::<_, 2>(src_view, src_x, coeffs_chunk, dst_chunk, &mut c_buf); 55 | src_x += 4; 56 | } 57 | 58 | dst_f32 = dst_chunks.into_remainder(); 59 | if !dst_f32.is_empty() { 60 | let y_start = coeffs_chunk.start; 61 | let coeffs = coeffs_chunk.values; 62 | native::convolution_by_f32(src_view, dst_f32, src_x, y_start, coeffs); 63 | } 64 | } 65 | 66 | #[inline] 67 | #[target_feature(enable = "sse4.1")] 68 | pub(crate) unsafe fn multiply_components_of_rows< 69 | T: InnerPixel, 70 | const SUMS_COUNT: usize, 71 | >( 72 | src_view: &impl ImageView, 73 | src_x: usize, 74 | coeffs_chunk: CoefficientsChunk, 75 | dst_chunk: &mut [f32], 76 | c_buf: &mut [f64; 2], 77 | ) { 78 | let mut sums = [_mm_set1_pd(0.); SUMS_COUNT]; 79 | let y_start = coeffs_chunk.start; 80 | let mut coeffs = coeffs_chunk.values; 81 | let mut y: u32 = 0; 82 | let max_rows = coeffs.len() as u32; 83 | 84 | let coeffs_2 = coeffs.chunks_exact(2); 85 | coeffs = coeffs_2.remainder(); 86 | for (src_rows, two_coeffs) in src_view.iter_2_rows(y_start, max_rows).zip(coeffs_2) { 87 | let src_rows = src_rows.map(|row| T::components(row).get_unchecked(src_x..)); 88 | for (&coeff, src_row) in two_coeffs.iter().zip(src_rows) { 89 | multiply_components_of_row(&mut sums, coeff, src_row); 90 | } 91 | y += 2; 92 | } 93 | 94 | if let Some(&coeff) = coeffs.first() { 95 | if let Some(s_row) = src_view.iter_rows(y_start + y).next() { 96 | let src_row = T::components(s_row).get_unchecked(src_x..); 97 | multiply_components_of_row(&mut sums, coeff, src_row); 98 | } 99 | } 100 | 101 | let mut dst_ptr = dst_chunk.as_mut_ptr(); 102 | for sum in sums { 103 | _mm_storeu_pd(c_buf.as_mut_ptr(), sum); 104 | for &v in c_buf.iter() { 105 | *dst_ptr = v as f32; 106 | dst_ptr = dst_ptr.add(1); 107 | } 108 | } 109 | } 110 | 111 | #[inline] 112 | #[target_feature(enable = "sse4.1")] 113 | unsafe fn multiply_components_of_row( 114 | sums: &mut [__m128d; SUMS_COUNT], 115 | coeff: f64, 116 | src_row: &[f32], 117 | ) { 118 | let coeff_f64x2 = _mm_set1_pd(coeff); 119 | let mut i = 0; 120 | while i < SUMS_COUNT { 121 | let comp03_f32x4 = simd_utils::loadu_ps(src_row, i * 2); 122 | 123 | let comp01_f64x2 = _mm_cvtps_pd(comp03_f32x4); 124 | sums[i] = 
_mm_add_pd(sums[i], _mm_mul_pd(comp01_f64x2, coeff_f64x2)); 125 | i += 1; 126 | 127 | let comp23_f64x2 = _mm_cvtps_pd(_mm_movehl_ps(comp03_f32x4, comp03_f32x4)); 128 | sums[i] = _mm_add_pd(sums[i], _mm_mul_pd(comp23_f64x2, coeff_f64x2)); 129 | i += 1; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/convolution/vertical_u16/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | pub mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_u16>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | normalizer: &Normalizer32, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(normalizer.chunks_len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, normalizer), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, normalizer), 31 | #[cfg(target_arch = "aarch64")] 32 | CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, normalizer), 33 | #[cfg(target_arch = "wasm32")] 34 | CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, normalizer), 35 | _ => native::vert_convolution(src_view, dst_view, offset, normalizer), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/convolution/vertical_u16/native.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer32; 2 | use crate::pixels::InnerPixel; 3 | use crate::utils::foreach_with_pre_reading; 4 | use crate::{ImageView, ImageViewMut}; 5 | 6 | #[inline(always)] 7 | pub(crate) fn vert_convolution( 8 | src_view: &impl ImageView, 9 | dst_view: &mut impl ImageViewMut, 10 | offset: u32, 11 | normalizer: &Normalizer32, 12 | ) where 13 | T: InnerPixel, 14 | { 15 | let coefficients_chunks = normalizer.chunks(); 16 | let precision = normalizer.precision(); 17 | let initial: i64 = 1 << (precision - 1); 18 | let src_x_initial = offset as usize * T::count_of_components(); 19 | 20 | let dst_rows = dst_view.iter_rows_mut(0); 21 | let coeffs_chunks_iter = coefficients_chunks.iter(); 22 | for (coeffs_chunk, dst_row) in coeffs_chunks_iter.zip(dst_rows) { 23 | let first_y_src = coeffs_chunk.start; 24 | let ks = coeffs_chunk.values(); 25 | let dst_components = T::components_mut(dst_row); 26 | let mut x_src = src_x_initial; 27 | 28 | let (_, dst_chunks, tail) = unsafe { dst_components.align_to_mut::<[u16; 16]>() }; 29 | x_src = convolution_by_chunks( 30 | src_view, 31 | normalizer, 32 | initial, 33 | dst_chunks, 34 | x_src, 35 | first_y_src, 36 | ks, 37 | ); 38 | 39 | if !tail.is_empty() { 40 | convolution_by_u16(src_view, normalizer, initial, tail, x_src, first_y_src, ks); 41 | } 42 | } 43 | } 44 | 45 | #[inline(always)] 46 | pub(crate) fn convolution_by_u16>( 47 | src_view: &impl 
ImageView, 48 | normalizer: &Normalizer32, 49 | initial: i64, 50 | dst_components: &mut [u16], 51 | mut x_src: usize, 52 | first_y_src: u32, 53 | ks: &[i32], 54 | ) -> usize { 55 | for dst_component in dst_components.iter_mut() { 56 | let mut ss = initial; 57 | let src_rows = src_view.iter_rows(first_y_src); 58 | for (&k, src_row) in ks.iter().zip(src_rows) { 59 | // SAFETY: Alignment of src_row is greater or equal than alignment u16 60 | // because one component of pixel type T is u16. 61 | let src_ptr = src_row.as_ptr() as *const u16; 62 | let src_component = unsafe { *src_ptr.add(x_src) }; 63 | ss += src_component as i64 * (k as i64); 64 | } 65 | *dst_component = normalizer.clip(ss); 66 | x_src += 1 67 | } 68 | x_src 69 | } 70 | 71 | #[inline(always)] 72 | fn convolution_by_chunks( 73 | src_view: &impl ImageView, 74 | normalizer: &Normalizer32, 75 | initial: i64, 76 | dst_chunks: &mut [[u16; CHUNK_SIZE]], 77 | mut x_src: usize, 78 | first_y_src: u32, 79 | ks: &[i32], 80 | ) -> usize 81 | where 82 | T: InnerPixel, 83 | { 84 | for dst_chunk in dst_chunks { 85 | let mut ss = [initial; CHUNK_SIZE]; 86 | let src_rows = src_view.iter_rows(first_y_src); 87 | 88 | foreach_with_pre_reading( 89 | ks.iter().zip(src_rows), 90 | |(&k, src_row)| { 91 | let src_ptr = src_row.as_ptr() as *const u16; 92 | let src_chunk = unsafe { 93 | let ptr = src_ptr.add(x_src) as *const [u16; CHUNK_SIZE]; 94 | ptr.read_unaligned() 95 | }; 96 | (src_chunk, k) 97 | }, 98 | |(src_chunk, k)| { 99 | for (s, c) in ss.iter_mut().zip(src_chunk) { 100 | *s += c as i64 * (k as i64); 101 | } 102 | }, 103 | ); 104 | 105 | for (i, s) in ss.iter().copied().enumerate() { 106 | dst_chunk[i] = normalizer.clip(s); 107 | } 108 | x_src += CHUNK_SIZE; 109 | } 110 | x_src 111 | } 112 | -------------------------------------------------------------------------------- /src/convolution/vertical_u8/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::convolution::optimisations::Normalizer16; 2 | use crate::pixels::InnerPixel; 3 | use crate::{CpuExtensions, ImageView, ImageViewMut}; 4 | 5 | #[cfg(target_arch = "x86_64")] 6 | pub(crate) mod avx2; 7 | pub(crate) mod native; 8 | #[cfg(target_arch = "aarch64")] 9 | mod neon; 10 | #[cfg(target_arch = "x86_64")] 11 | pub(crate) mod sse4; 12 | #[cfg(target_arch = "wasm32")] 13 | pub(crate) mod wasm32; 14 | 15 | pub(crate) fn vert_convolution_u8>( 16 | src_view: &impl ImageView, 17 | dst_view: &mut impl ImageViewMut, 18 | offset: u32, 19 | normalizer: &Normalizer16, 20 | cpu_extensions: CpuExtensions, 21 | ) { 22 | // Check safety conditions 23 | debug_assert!(src_view.width() - offset >= dst_view.width()); 24 | debug_assert_eq!(normalizer.chunks_len(), dst_view.height() as usize); 25 | 26 | match cpu_extensions { 27 | #[cfg(target_arch = "x86_64")] 28 | CpuExtensions::Avx2 => avx2::vert_convolution(src_view, dst_view, offset, normalizer), 29 | #[cfg(target_arch = "x86_64")] 30 | CpuExtensions::Sse4_1 => sse4::vert_convolution(src_view, dst_view, offset, normalizer), 31 | #[cfg(target_arch = "aarch64")] 32 | CpuExtensions::Neon => neon::vert_convolution(src_view, dst_view, offset, normalizer), 33 | #[cfg(target_arch = "wasm32")] 34 | CpuExtensions::Simd128 => wasm32::vert_convolution(src_view, dst_view, offset, normalizer), 35 | _ => native::vert_convolution(src_view, dst_view, offset, normalizer), 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/cpu_extensions.rs: 
-------------------------------------------------------------------------------- 1 | /// SIMD extension of CPU. 2 | /// Specific variants depend on target architecture. 3 | /// Look at source code to see all available variants. 4 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 5 | pub enum CpuExtensions { 6 | None, 7 | #[cfg(target_arch = "x86_64")] 8 | /// SIMD extension of x86_64 architecture 9 | Sse4_1, 10 | #[cfg(target_arch = "x86_64")] 11 | /// SIMD extension of x86_64 architecture 12 | Avx2, 13 | #[cfg(target_arch = "aarch64")] 14 | /// SIMD extension of Arm64 architecture 15 | Neon, 16 | #[cfg(target_arch = "wasm32")] 17 | /// SIMD extension of Wasm32 architecture 18 | Simd128, 19 | } 20 | 21 | impl CpuExtensions { 22 | /// Returns `true` if your CPU support the extension. 23 | pub fn is_supported(&self) -> bool { 24 | match self { 25 | #[cfg(target_arch = "x86_64")] 26 | Self::Avx2 => is_x86_feature_detected!("avx2"), 27 | #[cfg(target_arch = "x86_64")] 28 | Self::Sse4_1 => is_x86_feature_detected!("sse4.1"), 29 | #[cfg(target_arch = "aarch64")] 30 | Self::Neon => std::arch::is_aarch64_feature_detected!("neon"), 31 | #[cfg(target_arch = "wasm32")] 32 | Self::Simd128 => true, 33 | Self::None => true, 34 | } 35 | } 36 | } 37 | 38 | impl Default for CpuExtensions { 39 | #[cfg(target_arch = "x86_64")] 40 | fn default() -> Self { 41 | if is_x86_feature_detected!("avx2") { 42 | Self::Avx2 43 | } else if is_x86_feature_detected!("sse4.1") { 44 | Self::Sse4_1 45 | } else { 46 | Self::None 47 | } 48 | } 49 | 50 | #[cfg(target_arch = "aarch64")] 51 | fn default() -> Self { 52 | if std::arch::is_aarch64_feature_detected!("neon") { 53 | Self::Neon 54 | } else { 55 | Self::None 56 | } 57 | } 58 | #[cfg(target_arch = "wasm32")] 59 | fn default() -> Self { 60 | Self::Simd128 61 | } 62 | 63 | #[cfg(not(any( 64 | target_arch = "x86_64", 65 | target_arch = "aarch64", 66 | target_arch = "wasm32" 67 | )))] 68 | fn default() -> Self { 69 | Self::None 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 4 | #[non_exhaustive] 5 | pub enum ImageError { 6 | #[error("Pixel type of image is not supported")] 7 | UnsupportedPixelType, 8 | } 9 | 10 | #[derive(Error, Debug, Clone, Copy)] 11 | #[error("Size of container with pixels is smaller than required")] 12 | pub struct InvalidPixelsSize; 13 | 14 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 15 | pub enum ImageBufferError { 16 | #[error("Size of buffer is smaller than required")] 17 | InvalidBufferSize, 18 | #[error("Alignment of buffer don't match to alignment of required pixel type")] 19 | InvalidBufferAlignment, 20 | } 21 | 22 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 23 | pub enum CropBoxError { 24 | #[error("Position of the crop box is out of the image boundaries")] 25 | PositionIsOutOfImageBoundaries, 26 | #[error("Size of the crop box is out of the image boundaries")] 27 | SizeIsOutOfImageBoundaries, 28 | #[error("Width or height of the crop box is less than zero")] 29 | WidthOrHeightLessThanZero, 30 | } 31 | 32 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 33 | #[non_exhaustive] 34 | pub enum ResizeError { 35 | #[error("Source or destination image is not supported")] 36 | ImageError(#[from] ImageError), 37 | #[error("Pixel type of source image does not match to destination image")] 38 | 
PixelTypesAreDifferent, 39 | #[error("Source cropping option is invalid: {0}")] 40 | SrcCroppingError(#[from] CropBoxError), 41 | } 42 | 43 | #[derive(Error, Debug, Clone, Copy)] 44 | #[error( 45 | "The dimensions of the source image are not equal to the dimensions of the destination image" 46 | )] 47 | pub struct DifferentDimensionsError; 48 | 49 | #[derive(Error, Debug, Clone, Copy, PartialEq, Eq)] 50 | pub enum MappingError { 51 | #[error("Source or destination image is not supported")] 52 | ImageError(#[from] ImageError), 53 | #[error("The dimensions of the source image are not equal to the dimensions of the destination image")] 54 | DifferentDimensions, 55 | #[error("Unsupported combination of pixels of source and/or destination images")] 56 | UnsupportedCombinationOfImageTypes, 57 | } 58 | 59 | impl From for MappingError { 60 | fn from(_: DifferentDimensionsError) -> Self { 61 | MappingError::DifferentDimensions 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/images/cropped_image.rs: -------------------------------------------------------------------------------- 1 | use crate::images::{check_crop_box, TypedCroppedImage, TypedCroppedImageMut}; 2 | use crate::{ 3 | CropBoxError, ImageView, ImageViewMut, IntoImageView, IntoImageViewMut, PixelTrait, PixelType, 4 | }; 5 | 6 | /// It is a wrapper that provides [IntoImageView] for part of wrapped image. 7 | pub struct CroppedImage<'a, V: IntoImageView> { 8 | image: &'a V, 9 | left: u32, 10 | top: u32, 11 | width: u32, 12 | height: u32, 13 | } 14 | 15 | /// It is a wrapper that provides [IntoImageView] and [IntoImageViewMut] for part of wrapped image. 16 | pub struct CroppedImageMut<'a, V: IntoImageView> { 17 | image: &'a mut V, 18 | left: u32, 19 | top: u32, 20 | width: u32, 21 | height: u32, 22 | } 23 | 24 | impl<'a, V: IntoImageView> CroppedImage<'a, V> { 25 | pub fn new( 26 | image: &'a V, 27 | left: u32, 28 | top: u32, 29 | width: u32, 30 | height: u32, 31 | ) -> Result { 32 | check_crop_box(image.width(), image.height(), left, top, width, height)?; 33 | Ok(Self { 34 | image, 35 | left, 36 | top, 37 | width, 38 | height, 39 | }) 40 | } 41 | } 42 | 43 | impl<'a, V: IntoImageView> CroppedImageMut<'a, V> { 44 | pub fn new( 45 | image: &'a mut V, 46 | left: u32, 47 | top: u32, 48 | width: u32, 49 | height: u32, 50 | ) -> Result { 51 | check_crop_box(image.width(), image.height(), left, top, width, height)?; 52 | Ok(Self { 53 | image, 54 | left, 55 | top, 56 | width, 57 | height, 58 | }) 59 | } 60 | } 61 | 62 | impl<'a, V: IntoImageView> IntoImageView for CroppedImage<'a, V> { 63 | fn pixel_type(&self) -> Option { 64 | self.image.pixel_type() 65 | } 66 | 67 | fn width(&self) -> u32 { 68 | self.width 69 | } 70 | 71 | fn height(&self) -> u32 { 72 | self.height 73 | } 74 | 75 | fn image_view(&self) -> Option> { 76 | self.image.image_view().map(|v| { 77 | TypedCroppedImage::new(v, self.left, self.top, self.width, self.height).unwrap() 78 | }) 79 | } 80 | } 81 | 82 | impl<'a, V: IntoImageView> IntoImageView for CroppedImageMut<'a, V> { 83 | fn pixel_type(&self) -> Option { 84 | self.image.pixel_type() 85 | } 86 | 87 | fn width(&self) -> u32 { 88 | self.width 89 | } 90 | 91 | fn height(&self) -> u32 { 92 | self.height 93 | } 94 | 95 | fn image_view(&self) -> Option> { 96 | self.image.image_view().map(|v| { 97 | TypedCroppedImage::new(v, self.left, self.top, self.width, self.height).unwrap() 98 | }) 99 | } 100 | } 101 | 102 | impl<'a, V: IntoImageViewMut> IntoImageViewMut for 
CroppedImageMut<'a, V> { 103 | fn image_view_mut(&mut self) -> Option> { 104 | self.image.image_view_mut().map(|v| { 105 | TypedCroppedImageMut::new(v, self.left, self.top, self.width, self.height).unwrap() 106 | }) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/images/image_crate.rs: -------------------------------------------------------------------------------- 1 | use std::ops::DerefMut; 2 | 3 | use crate::image_view::try_pixel_type; 4 | use crate::images::{TypedImage, TypedImageRef}; 5 | use crate::{ImageView, ImageViewMut, IntoImageView, IntoImageViewMut, PixelTrait, PixelType}; 6 | use bytemuck::cast_slice_mut; 7 | use image::DynamicImage; 8 | 9 | impl IntoImageView for DynamicImage { 10 | fn pixel_type(&self) -> Option { 11 | match self { 12 | DynamicImage::ImageLuma8(_) => Some(PixelType::U8), 13 | DynamicImage::ImageLumaA8(_) => Some(PixelType::U8x2), 14 | DynamicImage::ImageRgb8(_) => Some(PixelType::U8x3), 15 | DynamicImage::ImageRgba8(_) => Some(PixelType::U8x4), 16 | DynamicImage::ImageLuma16(_) => Some(PixelType::U16), 17 | DynamicImage::ImageLumaA16(_) => Some(PixelType::U16x2), 18 | DynamicImage::ImageRgb16(_) => Some(PixelType::U16x3), 19 | DynamicImage::ImageRgba16(_) => Some(PixelType::U16x4), 20 | _ => None, 21 | } 22 | } 23 | 24 | fn width(&self) -> u32 { 25 | self.width() 26 | } 27 | 28 | fn height(&self) -> u32 { 29 | self.height() 30 | } 31 | 32 | fn image_view(&self) -> Option> { 33 | if let Ok(pixel_type) = try_pixel_type(self) { 34 | if P::pixel_type() == pixel_type { 35 | return TypedImageRef::
<P>
::from_buffer( 36 | self.width(), 37 | self.height(), 38 | self.as_bytes(), 39 | ) 40 | .ok(); 41 | } 42 | } 43 | None 44 | } 45 | } 46 | 47 | impl IntoImageViewMut for DynamicImage { 48 | fn image_view_mut(&mut self) -> Option> { 49 | if let Ok(pixel_type) = try_pixel_type(self) { 50 | if P::pixel_type() == pixel_type { 51 | return TypedImage::
<P>
::from_buffer( 52 | self.width(), 53 | self.height(), 54 | image_as_bytes_mut(self), 55 | ) 56 | .ok(); 57 | } 58 | } 59 | None 60 | } 61 | } 62 | 63 | fn image_as_bytes_mut(image: &mut DynamicImage) -> &mut [u8] { 64 | match image { 65 | DynamicImage::ImageLuma8(img) => (*img).deref_mut(), 66 | DynamicImage::ImageLumaA8(img) => (*img).deref_mut(), 67 | DynamicImage::ImageRgb8(img) => (*img).deref_mut(), 68 | DynamicImage::ImageRgba8(img) => (*img).deref_mut(), 69 | DynamicImage::ImageLuma16(img) => cast_slice_mut((*img).deref_mut()), 70 | DynamicImage::ImageLumaA16(img) => cast_slice_mut((*img).deref_mut()), 71 | DynamicImage::ImageRgb16(img) => cast_slice_mut((*img).deref_mut()), 72 | DynamicImage::ImageRgba16(img) => cast_slice_mut((*img).deref_mut()), 73 | _ => &mut [], 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/images/mod.rs: -------------------------------------------------------------------------------- 1 | //! Contains different types of images and wrappers for them. 2 | use std::fmt::Debug; 3 | 4 | pub use cropped_image::*; 5 | pub use image::*; 6 | pub use typed_cropped_image::*; 7 | pub use typed_image::*; 8 | pub(crate) use unsafe_image::UnsafeImageMut; 9 | 10 | mod cropped_image; 11 | mod image; 12 | mod typed_cropped_image; 13 | mod typed_image; 14 | mod unsafe_image; 15 | 16 | #[cfg(feature = "image")] 17 | mod image_crate; 18 | 19 | #[derive(Debug)] 20 | enum BufferContainer<'a, T: Copy + Debug> { 21 | Borrowed(&'a mut [T]), 22 | Owned(Vec), 23 | } 24 | 25 | impl<'a, T: Copy + Debug> BufferContainer<'a, T> { 26 | fn as_vec(&self) -> Vec { 27 | match self { 28 | Self::Borrowed(slice) => slice.to_vec(), 29 | Self::Owned(vec) => vec.clone(), 30 | } 31 | } 32 | 33 | pub fn borrow(&self) -> &[T] { 34 | match self { 35 | Self::Borrowed(p_ref) => p_ref, 36 | Self::Owned(vec) => vec, 37 | } 38 | } 39 | 40 | pub fn borrow_mut(&mut self) -> &mut [T] { 41 | match self { 42 | Self::Borrowed(p_ref) => p_ref, 43 | Self::Owned(vec) => vec, 44 | } 45 | } 46 | } 47 | 48 | enum View<'a, V: 'a> { 49 | Borrowed(&'a V), 50 | Owned(V), 51 | } 52 | 53 | impl<'a, V> View<'a, V> { 54 | fn get_ref(&self) -> &V { 55 | match self { 56 | Self::Borrowed(v_ref) => v_ref, 57 | Self::Owned(v_own) => v_own, 58 | } 59 | } 60 | } 61 | 62 | enum ViewMut<'a, V: 'a> { 63 | Borrowed(&'a mut V), 64 | Owned(V), 65 | } 66 | 67 | impl<'a, V> ViewMut<'a, V> { 68 | fn get_ref(&self) -> &V { 69 | match self { 70 | Self::Borrowed(v_ref) => v_ref, 71 | Self::Owned(v_own) => v_own, 72 | } 73 | } 74 | 75 | fn get_mut(&mut self) -> &mut V { 76 | match self { 77 | Self::Borrowed(p_ref) => p_ref, 78 | Self::Owned(vec) => vec, 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/images/unsafe_image.rs: -------------------------------------------------------------------------------- 1 | use crate::{ArrayChunks, ImageView, ImageViewMut}; 2 | use std::marker::PhantomData; 3 | use std::num::NonZeroU32; 4 | 5 | #[derive(Copy)] 6 | pub(crate) struct UnsafeImageMut<'a, V> 7 | where 8 | V: ImageViewMut, 9 | { 10 | image: std::ptr::NonNull, 11 | p: PhantomData<&'a V>, 12 | } 13 | 14 | impl<'a, V> Clone for UnsafeImageMut<'a, V> 15 | where 16 | V: ImageViewMut, 17 | { 18 | fn clone(&self) -> Self { 19 | Self { 20 | image: self.image, 21 | p: PhantomData, 22 | } 23 | } 24 | } 25 | 26 | unsafe impl<'a, V: ImageViewMut> Send for UnsafeImageMut<'a, V> {} 27 | unsafe impl<'a, V: ImageViewMut> Sync for UnsafeImageMut<'a, V> 
{} 28 | 29 | impl<'a, V: ImageViewMut> UnsafeImageMut<'a, V> { 30 | pub fn new(image: &'a mut V) -> Self { 31 | let ptr = std::ptr::NonNull::new(image as *mut V).unwrap(); 32 | Self { 33 | image: ptr, 34 | p: PhantomData, 35 | } 36 | } 37 | 38 | fn get(&self) -> &V { 39 | unsafe { self.image.as_ref() } 40 | } 41 | 42 | fn get_mut(&mut self) -> &mut V { 43 | unsafe { self.image.as_mut() } 44 | } 45 | } 46 | 47 | unsafe impl<'a, V: ImageViewMut> ImageView for UnsafeImageMut<'a, V> { 48 | type Pixel = V::Pixel; 49 | 50 | fn width(&self) -> u32 { 51 | self.get().width() 52 | } 53 | 54 | fn height(&self) -> u32 { 55 | self.get().height() 56 | } 57 | 58 | fn iter_rows(&self, start_row: u32) -> impl Iterator { 59 | self.get().iter_rows(start_row) 60 | } 61 | 62 | fn iter_2_rows( 63 | &self, 64 | start_y: u32, 65 | max_rows: u32, 66 | ) -> ArrayChunks, 2> { 67 | self.get().iter_2_rows(start_y, max_rows) 68 | } 69 | 70 | fn iter_4_rows( 71 | &self, 72 | start_y: u32, 73 | max_rows: u32, 74 | ) -> ArrayChunks, 4> { 75 | self.get().iter_4_rows(start_y, max_rows) 76 | } 77 | 78 | fn iter_rows_with_step( 79 | &self, 80 | start_y: f64, 81 | step: f64, 82 | max_rows: u32, 83 | ) -> impl Iterator { 84 | self.get().iter_rows_with_step(start_y, step, max_rows) 85 | } 86 | 87 | fn split_by_height( 88 | &self, 89 | start_row: u32, 90 | height: NonZeroU32, 91 | num_parts: NonZeroU32, 92 | ) -> Option>> { 93 | self.get().split_by_height(start_row, height, num_parts) 94 | } 95 | 96 | fn split_by_width( 97 | &self, 98 | start_col: u32, 99 | width: NonZeroU32, 100 | num_parts: NonZeroU32, 101 | ) -> Option>> { 102 | self.get().split_by_width(start_col, width, num_parts) 103 | } 104 | } 105 | 106 | unsafe impl<'a, V: ImageViewMut> ImageViewMut for UnsafeImageMut<'a, V> { 107 | fn iter_rows_mut(&mut self, start_row: u32) -> impl Iterator { 108 | self.get_mut().iter_rows_mut(start_row) 109 | } 110 | 111 | fn iter_2_rows_mut(&mut self) -> ArrayChunks, 2> { 112 | self.get_mut().iter_2_rows_mut() 113 | } 114 | 115 | fn iter_4_rows_mut(&mut self) -> ArrayChunks, 4> { 116 | self.get_mut().iter_4_rows_mut() 117 | } 118 | 119 | fn split_by_height_mut( 120 | &mut self, 121 | start_row: u32, 122 | height: NonZeroU32, 123 | num_parts: NonZeroU32, 124 | ) -> Option>> { 125 | self.get_mut() 126 | .split_by_height_mut(start_row, height, num_parts) 127 | } 128 | 129 | fn split_by_width_mut( 130 | &mut self, 131 | start_col: u32, 132 | width: NonZeroU32, 133 | num_parts: NonZeroU32, 134 | ) -> Option>> { 135 | self.get_mut() 136 | .split_by_width_mut(start_col, width, num_parts) 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | //! 3 | //! 
## Feature flags 4 | #![doc = document_features::document_features!()] 5 | 6 | pub use alpha::errors::*; 7 | pub use array_chunks::*; 8 | pub use change_components_type::*; 9 | pub use color::mappers::*; 10 | pub use color::PixelComponentMapper; 11 | pub use convolution::*; 12 | pub use cpu_extensions::CpuExtensions; 13 | pub use crop_box::*; 14 | pub use errors::*; 15 | pub use image_view::*; 16 | pub use mul_div::MulDiv; 17 | pub use pixels::PixelType; 18 | pub use resizer::{ResizeAlg, ResizeOptions, Resizer, SrcCropping}; 19 | 20 | use crate::alpha::AlphaMulDiv; 21 | 22 | #[macro_use] 23 | mod utils; 24 | 25 | mod alpha; 26 | mod array_chunks; 27 | mod change_components_type; 28 | mod color; 29 | mod convolution; 30 | mod cpu_extensions; 31 | mod crop_box; 32 | mod errors; 33 | mod image_view; 34 | pub mod images; 35 | mod mul_div; 36 | #[cfg(target_arch = "aarch64")] 37 | mod neon_utils; 38 | pub mod pixels; 39 | mod resizer; 40 | #[cfg(target_arch = "x86_64")] 41 | mod simd_utils; 42 | #[cfg(feature = "for_testing")] 43 | pub mod testing; 44 | #[cfg(feature = "rayon")] 45 | pub(crate) mod threading; 46 | #[cfg(target_arch = "wasm32")] 47 | mod wasm32_utils; 48 | 49 | /// A trait implemented by all pixel types from the crate. 50 | /// 51 | /// This trait must be used in your code instead of [InnerPixel](pixels::InnerPixel). 52 | #[allow(private_bounds)] 53 | pub trait PixelTrait: Convolution + AlphaMulDiv {} 54 | 55 | impl PixelTrait for P {} 56 | -------------------------------------------------------------------------------- /src/simd_utils.rs: -------------------------------------------------------------------------------- 1 | use std::arch::x86_64::*; 2 | use std::intrinsics::transmute; 3 | 4 | use crate::pixels::{U8x3, U8x4}; 5 | 6 | #[inline(always)] 7 | pub unsafe fn loadu_si128(buf: &[T], index: usize) -> __m128i { 8 | _mm_loadu_si128(buf.get_unchecked(index..).as_ptr() as *const __m128i) 9 | } 10 | 11 | #[inline(always)] 12 | pub unsafe fn loadu_si256(buf: &[T], index: usize) -> __m256i { 13 | _mm256_loadu_si256(buf.get_unchecked(index..).as_ptr() as *const __m256i) 14 | } 15 | 16 | #[inline(always)] 17 | pub unsafe fn loadl_epi16(buf: &[T], index: usize) -> __m128i { 18 | let mem_addr = buf.get_unchecked(index..).as_ptr() as *const i16; 19 | _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, mem_addr.read_unaligned()) 20 | } 21 | 22 | #[inline(always)] 23 | pub unsafe fn loadl_epi32(buf: &[T], index: usize) -> __m128i { 24 | let mem_addr = buf.get_unchecked(index..).as_ptr() as *const i32; 25 | _mm_set_epi32(0, 0, 0, mem_addr.read_unaligned()) 26 | } 27 | 28 | #[inline(always)] 29 | pub unsafe fn loadl_epi64(buf: &[T], index: usize) -> __m128i { 30 | _mm_loadl_epi64(buf.get_unchecked(index..).as_ptr() as *const __m128i) 31 | } 32 | 33 | #[inline(always)] 34 | pub unsafe fn loadu_ps(buf: &[T], index: usize) -> __m128 { 35 | _mm_loadu_ps(buf.get_unchecked(index..).as_ptr() as *const f32) 36 | } 37 | 38 | #[inline(always)] 39 | pub unsafe fn loadu_ps256(buf: &[T], index: usize) -> __m256 { 40 | _mm256_loadu_ps(buf.get_unchecked(index..).as_ptr() as *const f32) 41 | } 42 | 43 | #[inline(always)] 44 | pub unsafe fn loadu_pd(buf: &[T], index: usize) -> __m128d { 45 | _mm_loadu_pd(buf.get_unchecked(index..).as_ptr() as *const f64) 46 | } 47 | 48 | #[inline(always)] 49 | pub unsafe fn loadu_pd256(buf: &[T], index: usize) -> __m256d { 50 | _mm256_loadu_pd(buf.get_unchecked(index..).as_ptr() as *const f64) 51 | } 52 | 53 | #[inline(always)] 54 | pub unsafe fn mm_cvtepu8_epi32(buf: &[U8x4], index: 
usize) -> __m128i { 55 | let v: i32 = transmute(buf.get_unchecked(index).0); 56 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v)) 57 | } 58 | 59 | #[inline(always)] 60 | pub unsafe fn mm_cvtepu8_epi32_u8x3(buf: &[U8x3], index: usize) -> __m128i { 61 | let pixel = buf.get_unchecked(index).0; 62 | let v: i32 = i32::from_le_bytes([pixel[0], pixel[1], pixel[2], 0]); 63 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v)) 64 | } 65 | 66 | #[inline(always)] 67 | pub unsafe fn mm_cvtepu8_epi32_from_u8(buf: &[u8], index: usize) -> __m128i { 68 | let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; 69 | _mm_cvtepu8_epi32(_mm_cvtsi32_si128(ptr.read_unaligned())) 70 | } 71 | 72 | #[inline(always)] 73 | pub unsafe fn mm_cvtsi32_si128_from_u8(buf: &[u8], index: usize) -> __m128i { 74 | let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; 75 | _mm_cvtsi32_si128(ptr.read_unaligned()) 76 | } 77 | 78 | #[inline(always)] 79 | pub unsafe fn mm_load_and_clone_i16x2(buf: &[i16]) -> __m128i { 80 | debug_assert!(buf.len() >= 2); 81 | _mm_set1_epi32((buf.as_ptr() as *const i32).read_unaligned()) 82 | } 83 | 84 | #[inline(always)] 85 | pub unsafe fn mm256_load_and_clone_i16x2(buf: &[i16]) -> __m256i { 86 | debug_assert!(buf.len() >= 2); 87 | _mm256_set1_epi32((buf.as_ptr() as *const i32).read_unaligned()) 88 | } 89 | 90 | #[inline(always)] 91 | pub unsafe fn ptr_i16_to_set1_epi64x(buf: &[i16], index: usize) -> __m128i { 92 | _mm_set1_epi64x((buf.get_unchecked(index..).as_ptr() as *const i64).read_unaligned()) 93 | } 94 | 95 | #[inline(always)] 96 | pub unsafe fn ptr_i16_to_256set1_epi64x(buf: &[i16], index: usize) -> __m256i { 97 | _mm256_set1_epi64x((buf.get_unchecked(index..).as_ptr() as *const i64).read_unaligned()) 98 | } 99 | -------------------------------------------------------------------------------- /src/testing.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | 3 | thread_local!(static TEST_LOGS: RefCell> = const { RefCell::new(Vec::new()) }); 4 | 5 | pub fn log_message(msg: &str) { 6 | TEST_LOGS.with(|f| { 7 | let mut logs = f.borrow_mut(); 8 | logs.push(msg.to_string()); 9 | }); 10 | } 11 | 12 | pub fn logs_contain(msg: &str) -> bool { 13 | TEST_LOGS.with(|f| { 14 | let logs = f.borrow(); 15 | for line in logs.iter() { 16 | if line.contains(msg) { 17 | return true; 18 | } 19 | } 20 | false 21 | }) 22 | } 23 | 24 | pub fn clear_log() { 25 | TEST_LOGS.with(|f| { 26 | let mut logs = f.borrow_mut(); 27 | logs.clear(); 28 | }) 29 | } 30 | -------------------------------------------------------------------------------- /src/threading.rs: -------------------------------------------------------------------------------- 1 | use crate::pixels::InnerPixel; 2 | use crate::{ImageView, ImageViewMut}; 3 | use rayon::current_num_threads; 4 | use rayon::prelude::*; 5 | use std::num::NonZeroU32; 6 | 7 | #[inline] 8 | pub(crate) fn split_h_two_images_for_threading<'a, P: InnerPixel>( 9 | src_view: &'a impl ImageView, 10 | dst_view: &'a mut impl ImageViewMut, 11 | src_offset: u32, 12 | ) -> Option< 13 | impl ParallelIterator< 14 | Item = ( 15 | impl ImageView + 'a, 16 | impl ImageViewMut + 'a, 17 | ), 18 | >, 19 | > { 20 | debug_assert!(src_view.height() - src_offset >= dst_view.height()); 21 | 22 | let dst_width = dst_view.width(); 23 | let dst_height = dst_view.height(); 24 | let max_num_parts = calculate_max_h_parts_number(dst_width, dst_height); 25 | 26 | let num_threads = current_num_threads() as u32; 27 | if num_threads > 1 && max_num_parts > 1 { 28 | 
let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 29 | let dst_height = NonZeroU32::new(dst_height).unwrap(); 30 | if let Some(src_parts) = src_view.split_by_height(src_offset, dst_height, num_parts) { 31 | if let Some(dst_parts) = dst_view.split_by_height_mut(0, dst_height, num_parts) { 32 | let src_iter = src_parts.into_par_iter(); 33 | let dst_iter = dst_parts.into_par_iter(); 34 | return Some(src_iter.zip(dst_iter)); 35 | } 36 | } 37 | } 38 | None 39 | } 40 | 41 | #[inline] 42 | pub(crate) fn split_h_one_image_for_threading( 43 | image_view: &mut impl ImageViewMut, 44 | ) -> Option + '_>> { 45 | let width = image_view.width(); 46 | let height = image_view.height(); 47 | let max_num_parts = calculate_max_h_parts_number(width, height); 48 | 49 | let num_threads = current_num_threads() as u32; 50 | if num_threads > 1 && max_num_parts > 1 { 51 | let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 52 | let height = NonZeroU32::new(height).unwrap(); 53 | let img_parts = image_view.split_by_height_mut(0, height, num_parts); 54 | return img_parts.map(|parts| parts.into_par_iter()); 55 | } 56 | None 57 | } 58 | 59 | /// It is not optimal to split images on too small parts. 60 | /// We have to calculate minimal height of one part. 61 | /// For small images, it is equal to `constant / area`. 62 | /// For tall images, it is equal to `height / 256`. 63 | fn calculate_max_h_parts_number(width: u32, height: u32) -> u32 { 64 | if width == 0 || height == 0 { 65 | return 1; 66 | } 67 | let area = height * height.max(width); 68 | let min_height = ((1 << 14) / area).max(height / 256); 69 | height / min_height.max(1) 70 | } 71 | 72 | #[inline] 73 | pub(crate) fn split_v_two_images_for_threading<'a, P: InnerPixel>( 74 | src_view: &'a impl ImageView, 75 | dst_view: &'a mut impl ImageViewMut, 76 | src_offset: u32, 77 | ) -> Option< 78 | impl ParallelIterator< 79 | Item = ( 80 | impl ImageView + 'a, 81 | impl ImageViewMut + 'a, 82 | ), 83 | >, 84 | > { 85 | debug_assert!(src_view.width() - src_offset >= dst_view.width()); 86 | 87 | let dst_width = dst_view.width(); 88 | let dst_height = dst_view.height(); 89 | let max_num_parts = calculate_max_v_parts_number(dst_width, dst_height); 90 | 91 | let num_threads = current_num_threads() as u32; 92 | if num_threads > 1 && max_num_parts > 1 { 93 | let num_parts = NonZeroU32::new(num_threads.min(max_num_parts)).unwrap(); 94 | let dst_width = NonZeroU32::new(dst_width).unwrap(); 95 | if let Some(src_parts) = src_view.split_by_width(src_offset, dst_width, num_parts) { 96 | if let Some(dst_parts) = dst_view.split_by_width_mut(0, dst_width, num_parts) { 97 | let src_iter = src_parts.into_par_iter(); 98 | let dst_iter = dst_parts.into_par_iter(); 99 | return Some(src_iter.zip(dst_iter)); 100 | } 101 | } 102 | } 103 | None 104 | } 105 | 106 | /// It is not optimal to split images on too small parts. 107 | /// We have to calculate minimal width of one part. 108 | /// For small images, it is equal to `constant / area`. 109 | /// For wide images, it is equal to `width / 256`. 
110 | fn calculate_max_v_parts_number(width: u32, height: u32) -> u32 { 111 | if width == 0 || height == 0 { 112 | return 1; 113 | } 114 | let area = width * height.max(width); 115 | let min_width = ((1 << 14) / area).max(width / 256); 116 | width / min_width.max(1) 117 | } 118 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | /// Pre-reading data from memory increases speed slightly for some operations 2 | #[inline(always)] 3 | pub(crate) fn foreach_with_pre_reading( 4 | mut iter: impl Iterator, 5 | mut read_data: impl FnMut(I) -> D, 6 | mut process_data: impl FnMut(D), 7 | ) { 8 | let mut next_data: D; 9 | if let Some(src) = iter.next() { 10 | next_data = read_data(src); 11 | for src in iter { 12 | let data = next_data; 13 | next_data = read_data(src); 14 | process_data(data); 15 | } 16 | process_data(next_data); 17 | } 18 | } 19 | 20 | macro_rules! test_log { 21 | ($s:expr) => { 22 | #[cfg(feature = "for_testing")] 23 | { 24 | use crate::testing::log_message; 25 | log_message($s); 26 | } 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /src/wasm32_utils.rs: -------------------------------------------------------------------------------- 1 | use std::arch::wasm32::*; 2 | 3 | use crate::pixels::{U8x3, U8x4}; 4 | 5 | #[inline] 6 | #[target_feature(enable = "simd128")] 7 | pub(crate) unsafe fn load_v128(buf: &[T], index: usize) -> v128 { 8 | v128_load(buf.get_unchecked(index..).as_ptr() as *const v128) 9 | } 10 | 11 | #[inline] 12 | #[target_feature(enable = "simd128")] 13 | pub(crate) unsafe fn loadl_i64(buf: &[T], index: usize) -> v128 { 14 | let p = buf.get_unchecked(index..).as_ptr() as *const i64; 15 | i64x2(p.read_unaligned(), 0) 16 | } 17 | 18 | #[inline] 19 | #[target_feature(enable = "simd128")] 20 | pub(crate) unsafe fn loadl_i32(buf: &[T], index: usize) -> v128 { 21 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 22 | i32x4(p.read_unaligned(), 0, 0, 0) 23 | } 24 | 25 | #[inline] 26 | #[target_feature(enable = "simd128")] 27 | pub(crate) unsafe fn loadl_i16(buf: &[T], index: usize) -> v128 { 28 | let p = buf.get_unchecked(index..).as_ptr() as *const i16; 29 | i16x8(p.read_unaligned(), 0, 0, 0, 0, 0, 0, 0) 30 | } 31 | 32 | #[inline] 33 | #[target_feature(enable = "simd128")] 34 | pub(crate) unsafe fn ptr_i16_to_set1_i64(buf: &[i16], index: usize) -> v128 { 35 | let p = buf.get_unchecked(index..).as_ptr() as *const i64; 36 | i64x2_splat(p.read_unaligned()) 37 | } 38 | 39 | #[inline] 40 | #[target_feature(enable = "simd128")] 41 | pub(crate) unsafe fn ptr_i16_to_set1_i32(buf: &[i16], index: usize) -> v128 { 42 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 43 | i32x4_splat(p.read_unaligned()) 44 | } 45 | 46 | #[inline] 47 | #[target_feature(enable = "simd128")] 48 | pub(crate) unsafe fn i32x4_extend_low_ptr_u8(buf: &[u8], index: usize) -> v128 { 49 | let p = buf.get_unchecked(index..).as_ptr() as *const v128; 50 | u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(v128_load(p))) 51 | } 52 | 53 | #[inline] 54 | #[target_feature(enable = "simd128")] 55 | pub(crate) unsafe fn i32x4_extend_low_ptr_u8x4(buf: &[U8x4], index: usize) -> v128 { 56 | let v: u32 = u32::from_le_bytes(buf.get_unchecked(index).0); 57 | u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(u32x4(v, 0, 0, 0))) 58 | } 59 | 60 | #[inline] 61 | #[target_feature(enable = "simd128")] 62 | pub(crate) unsafe fn 
i32x4_extend_low_ptr_u8x3(buf: &[U8x3], index: usize) -> v128 { 63 | let pixel = buf.get_unchecked(index).0; 64 | i32x4(pixel[0] as i32, pixel[1] as i32, pixel[2] as i32, 0) 65 | } 66 | 67 | #[inline] 68 | #[target_feature(enable = "simd128")] 69 | pub(crate) unsafe fn i32x4_v128_from_u8(buf: &[u8], index: usize) -> v128 { 70 | let p = buf.get_unchecked(index..).as_ptr() as *const i32; 71 | i32x4(p.read_unaligned(), 0, 0, 0) 72 | } 73 | 74 | // #[inline] 75 | // #[target_feature(enable = "simd128")] 76 | // pub(crate) unsafe fn u16x8_mul_shr16(a_u16x8: v128, b_u16x8: v128) -> v128 { 77 | // let lo_u32x4 = u32x4_extmul_low_u16x8(a_u16x8, b_u16x8); 78 | // let hi_u32x4 = u32x4_extmul_high_u16x8(a_u16x8, b_u16x8); 79 | // i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo_u32x4, hi_u32x4) 80 | // } 81 | 82 | pub(crate) unsafe fn u16x8_mul_add_shr16(a_u16x8: v128, b_u16x8: v128, c: v128) -> v128 { 83 | let lo_u32x4 = u32x4_extmul_low_u16x8(a_u16x8, b_u16x8); 84 | let hi_u32x4 = u32x4_extmul_high_u16x8(a_u16x8, b_u16x8); 85 | let lo_u32x4 = u32x4_add(lo_u32x4, c); 86 | let hi_u32x4 = u32x4_add(hi_u32x4, c); 87 | i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo_u32x4, hi_u32x4) 88 | } 89 | 90 | #[inline] 91 | #[target_feature(enable = "simd128")] 92 | pub(crate) unsafe fn i64x2_mul_lo(a: v128, b: v128) -> v128 { 93 | const SHUFFLE: v128 = i8x16(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1); 94 | i64x2_extmul_low_i32x4(i8x16_swizzle(a, SHUFFLE), i8x16_swizzle(b, SHUFFLE)) 95 | } 96 | -------------------------------------------------------------------------------- /tests/image_view.rs: -------------------------------------------------------------------------------- 1 | use fast_image_resize::images::{TypedCroppedImageMut, TypedImage}; 2 | use fast_image_resize::pixels::U8; 3 | use fast_image_resize::{ImageView, ImageViewMut}; 4 | use testing::non_zero_u32; 5 | 6 | mod testing; 7 | 8 | mod split_by_width { 9 | use super::*; 10 | use fast_image_resize::images::{TypedCroppedImage, TypedImageRef}; 11 | 12 | fn split(img: &T) { 13 | for num_parts in 1..16 { 14 | let res = img 15 | .split_by_width(0, non_zero_u32(512), non_zero_u32(num_parts)) 16 | .unwrap(); 17 | assert_eq!(res.len() as u32, num_parts); 18 | let sum_width = res.iter().map(|v| v.width()).sum::(); 19 | assert_eq!(sum_width, 512); 20 | } 21 | } 22 | 23 | fn split_mut(img: &mut T) { 24 | for num_parts in 1..16 { 25 | let res = img 26 | .split_by_width_mut(0, non_zero_u32(512), non_zero_u32(num_parts)) 27 | .unwrap(); 28 | assert_eq!(res.len() as u32, num_parts); 29 | let sum_width = res.iter().map(|v| v.width()).sum::(); 30 | assert_eq!(sum_width, 512); 31 | } 32 | } 33 | 34 | #[test] 35 | fn typed_image_ref() { 36 | let width = 512; 37 | let height = 384; 38 | let buffer = vec![U8::new(0); (width * height) as usize]; 39 | let img = TypedImageRef::::new(width, height, &buffer).unwrap(); 40 | split(&img); 41 | } 42 | 43 | #[test] 44 | fn typed_image() { 45 | let mut img = TypedImage::::new(512, 384); 46 | split(&img); 47 | split_mut(&mut img); 48 | } 49 | 50 | #[test] 51 | fn typed_cropped_image() { 52 | let img = TypedImage::::new(512 + 20, 384 + 20); 53 | let cropped_img = TypedCroppedImage::from_ref(&img, 10, 10, 512, 384).unwrap(); 54 | split(&cropped_img); 55 | } 56 | 57 | #[test] 58 | fn typed_cropped_image_mut() { 59 | let mut img = TypedImage::::new(512 + 20, 384 + 20); 60 | let mut cropped_img = TypedCroppedImageMut::from_ref(&mut img, 10, 10, 512, 384).unwrap(); 61 | split(&cropped_img); 62 | split_mut(&mut cropped_img); 63 | 
} 64 | } 65 | 66 | mod split_by_height { 67 | use super::*; 68 | use fast_image_resize::images::{TypedCroppedImage, TypedImageRef}; 69 | 70 | fn split(img: &T) { 71 | for num_parts in 1..16 { 72 | let res = img 73 | .split_by_height(0, non_zero_u32(512), non_zero_u32(num_parts)) 74 | .unwrap(); 75 | assert_eq!(res.len() as u32, num_parts); 76 | let sum_height = res.iter().map(|v| v.height()).sum::(); 77 | assert_eq!(sum_height, 512); 78 | } 79 | } 80 | 81 | fn split_mut(img: &mut T) { 82 | for num_parts in 1..16 { 83 | let res = img 84 | .split_by_height_mut(0, non_zero_u32(512), non_zero_u32(num_parts)) 85 | .unwrap(); 86 | assert_eq!(res.len() as u32, num_parts); 87 | let sum_height = res.iter().map(|v| v.height()).sum::(); 88 | assert_eq!(sum_height, 512); 89 | } 90 | } 91 | 92 | #[test] 93 | fn typed_image_ref() { 94 | let width = 384; 95 | let height = 512; 96 | let buffer = vec![U8::new(0); (width * height) as usize]; 97 | let img = TypedImageRef::::new(width, height, &buffer).unwrap(); 98 | split(&img); 99 | } 100 | 101 | #[test] 102 | fn typed_image() { 103 | let mut img: TypedImage = TypedImage::new(384, 512); 104 | split(&img); 105 | split_mut(&mut img); 106 | } 107 | 108 | #[test] 109 | fn typed_cropped_image() { 110 | let img = TypedImage::::new(384 + 20, 512 + 20); 111 | let cropped_img = TypedCroppedImage::from_ref(&img, 10, 10, 384, 512).unwrap(); 112 | split(&cropped_img); 113 | } 114 | 115 | #[test] 116 | fn typed_cropped_image_mut() { 117 | let mut img: TypedImage = TypedImage::new(384 + 20, 512 + 20); 118 | let mut cropped_img = TypedCroppedImageMut::from_ref(&mut img, 10, 10, 384, 512).unwrap(); 119 | split(&cropped_img); 120 | split_mut(&mut cropped_img); 121 | } 122 | } 123 | --------------------------------------------------------------------------------
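Note (not part of the repository): a minimal usage sketch assembled from the public items re-exported in src/lib.rs above (Resizer, CpuExtensions, PixelType and the images module). The exact signatures of Image::new, Resizer::new, Resizer::set_cpu_extensions and Resizer::resize are assumptions based on the crate's documented API and are not shown in this excerpt.

use fast_image_resize::images::Image;
use fast_image_resize::{CpuExtensions, PixelType, Resizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Source image: 64x64 RGBA8 pixels, zero-filled for the sake of the example.
    let src = Image::new(64, 64, PixelType::U8x4);
    // Destination image defines the target size of the resize operation.
    let mut dst = Image::new(32, 32, PixelType::U8x4);

    let mut resizer = Resizer::new();
    // The default CpuExtensions value already selects the best supported SIMD
    // extension (see src/cpu_extensions.rs above); forcing it explicitly is
    // unsafe because the caller must guarantee the CPU really supports it.
    let ext = CpuExtensions::default();
    if ext.is_supported() {
        unsafe { resizer.set_cpu_extensions(ext) };
    }

    // `None` means "use the default ResizeOptions".
    resizer.resize(&src, &mut dst, None)?;
    Ok(())
}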