├── .gitignore ├── assets └── 155603.jpg ├── docker-compose.yml ├── Dockerfile ├── .gitmodules ├── .vscode └── settings.json ├── src ├── benchmark │ ├── bilinear_plain_single_thread.hpp │ ├── bilinear_avx2_single_thread.hpp │ ├── bilinear_avx512_single_thread.hpp │ ├── bilinear_sse4_single_thread.hpp │ ├── bilinear_plain_multi_thread.hpp │ ├── bilinear_avx2_multi_thread.hpp │ ├── bilinear_avx512_multi_thread.hpp │ └── bilinear_sse4_multi_thread.hpp ├── interpolate │ ├── types.hpp │ ├── bilinear_plain.hpp │ ├── bilinear_sse4.hpp │ ├── bilinear_avx2.hpp │ └── bilinear_avx512.hpp ├── common.hpp └── main.cpp ├── CMakeLists.txt ├── .clang-format ├── Readme.md └── Benchmarks.md /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | !/build/.keep 3 | -------------------------------------------------------------------------------- /assets/155603.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jviney/bilinear_filter_simd/HEAD/assets/155603.jpg -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | base: 4 | build: . 
5 | volumes: 6 | - ".:/src" 7 | command: tail -f /dev/null 8 | working_dir: /src 9 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:23.04 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | RUN apt-get update 4 | RUN apt-get install -y --no-install-recommends build-essential cmake libopencv-dev 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "build/vendor/benchmark"] 2 | path = build/vendor/benchmark 3 | url = https://github.com/google/benchmark 4 | [submodule "vendor/benchmark"] 5 | path = vendor/benchmark 6 | url = https://github.com/google/benchmark 7 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "C_Cpp.clang_format_style": "file", 3 | 4 | "editor.rulers": [100], 5 | "editor.tabSize": 2, 6 | "editor.formatOnSave": true, 7 | 8 | "files.exclude": { 9 | "build": true, 10 | "vendor": true 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_plain_single_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_plain.hpp" 5 | 6 | cv::Mat3b bilinear_plain_single_thread(const BenchmarkInput& input) { 7 | auto output_image = cv::Mat3b(input.output_size); 8 | 9 | for (auto y = 0; y < output_image.rows; y++) { 10 | const auto* px_coords_row = input.coords.ptr(y); 11 | auto* output_px_row = output_image.ptr(y); 12 | 13 | for (int x = 0; x < output_image.cols; x += 4) { 14 | const auto* px_coords = px_coords_row + x; 15 | auto* output_pixels = output_px_row + x; 
16 | 17 | interpolate::bilinear::plain::interpolate_multiple<4>( 18 | input.source_image, reinterpret_cast(output_pixels), 19 | reinterpret_cast(px_coords)); 20 | } 21 | } 22 | 23 | return output_image; 24 | } 25 | 26 | static void BM_bilinear_plain_single_thread(benchmark::State& state, const BenchmarkInput& input) { 27 | for (auto _ : state) { 28 | bilinear_plain_single_thread(input); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_avx2_single_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_avx2.hpp" 5 | 6 | cv::Mat3b bilinear_avx2_single_thread(const BenchmarkInput& input) { 7 | auto output_image = cv::Mat3b(input.output_size); 8 | 9 | static constexpr auto step = 4; 10 | 11 | for (auto y = 0; y < output_image.rows; y++) { 12 | const auto* px_coords_row = input.coords.ptr(y); 13 | auto* output_pixels_row = output_image.ptr(y); 14 | 15 | for (auto x = 0; x < output_image.cols; x += step) { 16 | const auto* px_coords = px_coords_row + x; 17 | auto* output_pixels = output_pixels_row + x; 18 | 19 | interpolate::bilinear::avx2::interpolate( 20 | input.source_image, reinterpret_cast(px_coords), 21 | reinterpret_cast(output_pixels)); 22 | } 23 | } 24 | 25 | return output_image; 26 | } 27 | 28 | static void BM_bilinear_avx2_single_thread(benchmark::State& state, const BenchmarkInput& input) { 29 | for (auto _ : state) { 30 | bilinear_avx2_single_thread(input); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | project(bilinear_filter_simd LANGUAGES CXX) 4 | 5 | add_executable(bilinear_filter_simd 6 | src/main.cpp 7 | ) 8 | 9 | if (NOT CMAKE_BUILD_TYPE) 10 | set(CMAKE_BUILD_TYPE 
Release) 11 | endif() 12 | 13 | target_include_directories(bilinear_filter_simd PRIVATE src/) 14 | 15 | # Enable compiler warnings 16 | set(WARNINGS -Wall -Wextra -Wpedantic) 17 | target_compile_options(bilinear_filter_simd PRIVATE ${WARNINGS}) 18 | 19 | # C++17 20 | set_target_properties(bilinear_filter_simd PROPERTIES CXX_STANDARD 17) 21 | set_target_properties(bilinear_filter_simd PROPERTIES CXX_STANDARD_REQUIRED ON) 22 | 23 | # Target native architecture 24 | target_compile_options(bilinear_filter_simd PUBLIC -march=native) 25 | 26 | # Use OpenCV 27 | find_package(OpenCV 4 REQUIRED) 28 | target_include_directories(bilinear_filter_simd PRIVATE ${OpenCV_INCLUDE_DIRS}) 29 | target_link_libraries(bilinear_filter_simd PRIVATE ${OpenCV_LIBRARIES}) 30 | 31 | # Benchmark 32 | set(BENCHMARK_ENABLE_GTEST_TESTS OFF) 33 | add_subdirectory(vendor/benchmark) 34 | target_link_libraries(bilinear_filter_simd PRIVATE benchmark::benchmark) 35 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_avx512_single_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_avx512.hpp" 5 | 6 | cv::Mat3b bilinear_avx512_single_thread(const BenchmarkInput& input) { 7 | auto output_image = cv::Mat3b(input.output_size); 8 | 9 | static constexpr auto step = 8; 10 | 11 | for (auto y = 0; y < output_image.rows; y++) { 12 | const auto* px_coords_row = input.coords.ptr(y); 13 | auto* output_pixels_row = output_image.ptr(y); 14 | 15 | for (auto x = 0; x < output_image.cols; x += step) { 16 | const auto* px_coords = px_coords_row + x; 17 | auto* output_pixels = output_pixels_row + x; 18 | 19 | interpolate::bilinear::avx512::interpolate( 20 | input.source_image, reinterpret_cast(px_coords), 21 | reinterpret_cast(output_pixels)); 22 | } 23 | } 24 | 25 | return output_image; 26 | } 27 | 28 | static void 
BM_bilinear_avx512_single_thread(benchmark::State& state, const BenchmarkInput& input) { 29 | for (auto _ : state) { 30 | bilinear_avx512_single_thread(input); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/interpolate/types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace interpolate 6 | { 7 | 8 | struct InputCoords { 9 | float y; 10 | float x; 11 | }; 12 | 13 | struct BGRPixel { 14 | uint8_t b; 15 | uint8_t g; 16 | uint8_t r; 17 | }; 18 | // Non-owning view of a 3-bytes-per-pixel BGR image; `step` is the distance in bytes between row starts. 19 | class BGRImage 20 | { 21 | public: 22 | int rows; 23 | int cols; 24 | int step; 25 | BGRPixel* data; // non-owner 26 | uintptr_t data_end; // address of data + rows * step (one past the last row) 27 | // Default: an empty view with every field zero / null instead of indeterminate values. 28 | BGRImage() : rows(0), cols(0), step(0), data(nullptr), data_end(0) {} 29 | BGRImage(int rows, int cols, int step, BGRPixel* data) 30 | : rows(rows), 31 | cols(cols), 32 | step(step), 33 | data(data), 34 | data_end(((uintptr_t) data) + rows * step) {} 35 | // Address of the pixel at (row, col): byte offset row * step + col * 3 from data. No bounds checking. 36 | inline const BGRPixel* ptr(int row, int col) const { 37 | int offset = (row * step) + (col * 3); 38 | return (const BGRPixel*) (((const uint8_t*) data) + offset); 39 | } 40 | // Pixel one row (step bytes) below ptr; returns ptr itself when that address would be at or past data_end. 41 | inline const BGRPixel* ptr_below(const BGRPixel* ptr) const { 42 | auto end = ((uintptr_t) ptr) + step; 43 | 44 | if (end < data_end) [[likely]] { 45 | return (const BGRPixel*) end; 46 | } else { 47 | return ptr; 48 | } 49 | } 50 | }; 51 | 52 | } // namespace interpolate 53 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_sse4_single_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_sse4.hpp" 5 | 6 | static cv::Mat3b bilinear_sse4_single_thread(const BenchmarkInput& input) { 7 | auto output_image = cv::Mat3b(input.output_size); 8 | 9 | auto* last_output_pixel = 10 | output_image.ptr(output_image.rows - 1, output_image.cols - 1); 11 | 12 | for (auto y = 0; y <
output_image.rows; y++) { 13 | auto* output_row_start = output_image.ptr(y); 14 | const auto* px_coords_row = input.coords.ptr(y); 15 | 16 | for (auto x = 0; x < output_image.cols; x += 2) { 17 | const auto* px_coords = px_coords_row + x; 18 | auto* output_pixels = output_row_start + x; 19 | 20 | auto is_last_output_pixel = (output_pixels + 1 == last_output_pixel); 21 | 22 | interpolate::bilinear::sse4::interpolate( 23 | input.source_image, reinterpret_cast(px_coords), 24 | reinterpret_cast(output_pixels), !is_last_output_pixel); 25 | } 26 | } 27 | 28 | return output_image; 29 | } 30 | 31 | static void BM_bilinear_sse4_single_thread(benchmark::State& state, const BenchmarkInput& input) { 32 | for (auto _ : state) { 33 | bilinear_sse4_single_thread(input); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_plain_multi_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_plain.hpp" 5 | 6 | class InterpolatePlainMultiThread : public cv::ParallelLoopBody 7 | { 8 | public: 9 | InterpolatePlainMultiThread(const interpolate::BGRImage& input_image, const cv::Mat2f& coords, 10 | cv::Mat3b& output_image) 11 | : input_image_(input_image), coords_(coords), output_image_(output_image) {} 12 | 13 | virtual void operator()(const cv::Range& range) const override { 14 | for (auto y = range.start; y < range.end; y++) { 15 | const auto* px_coords_row = coords_.ptr(y); 16 | auto* output_row = output_image_.ptr(y); 17 | 18 | for (auto x = 0; x < output_image_.cols; x += 4) { 19 | const auto* px_coords = px_coords_row + x; 20 | auto* output_pixel = output_row + x; 21 | 22 | interpolate::bilinear::plain::interpolate_multiple<4>( 23 | input_image_, reinterpret_cast(output_pixel), 24 | reinterpret_cast(px_coords)); 25 | } 26 | } 27 | } 28 | 29 | private: 30 | const interpolate::BGRImage& 
input_image_; 31 | const cv::Mat2f coords_; 32 | cv::Mat3b& output_image_; 33 | }; 34 | 35 | cv::Mat3b bilinear_plain_multi_thread(const BenchmarkInput& input) { 36 | auto output_image = cv::Mat3b(input.output_size); 37 | auto parallel_executor = 38 | InterpolatePlainMultiThread(input.source_image, input.coords, output_image); 39 | 40 | cv::parallel_for_(cv::Range(0, output_image.rows), parallel_executor); 41 | 42 | return output_image; 43 | } 44 | 45 | static void BM_bilinear_plain_multi_thread(benchmark::State& state, const BenchmarkInput& input) { 46 | for (auto _ : state) { 47 | bilinear_plain_multi_thread(input); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_avx2_multi_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_avx2.hpp" 5 | 6 | class InterpolateAVX2MultiThread : public cv::ParallelLoopBody 7 | { 8 | public: 9 | InterpolateAVX2MultiThread(const interpolate::BGRImage& input_image, const cv::Mat2f& coords, 10 | cv::Mat3b& output_image) 11 | : input_image_(input_image), coords_(coords), output_image_(output_image) { 12 | if (output_image.cols % step != 0) { 13 | throw std::runtime_error("output frame width must be multiple of 4"); 14 | } 15 | } 16 | 17 | virtual void operator()(const cv::Range& range) const override { 18 | for (auto y = range.start; y < range.end; y++) { 19 | auto* px_coords_row = coords_.ptr(y); 20 | auto* output_pixels_row = output_image_.ptr(y); 21 | 22 | for (auto x = 0; x < output_image_.cols; x += step) { 23 | auto px_coords = px_coords_row + x; 24 | auto* output_pixels = output_pixels_row + x; 25 | 26 | interpolate::bilinear::avx2::interpolate( 27 | input_image_, reinterpret_cast(px_coords), 28 | reinterpret_cast(output_pixels)); 29 | } 30 | } 31 | } 32 | 33 | private: 34 | static constexpr auto step = 4; 35 | 36 | const 
interpolate::BGRImage input_image_; 37 | const cv::Mat2f coords_; 38 | cv::Mat3b& output_image_; 39 | }; 40 | 41 | cv::Mat3b bilinear_avx2_multi_thread(const BenchmarkInput& input) { 42 | auto output_image = cv::Mat3b(input.output_size); 43 | auto parallel_executor = 44 | InterpolateAVX2MultiThread(input.source_image, input.coords, output_image); 45 | 46 | cv::parallel_for_(cv::Range(0, output_image.rows), parallel_executor); 47 | 48 | return output_image; 49 | } 50 | 51 | static void BM_bilinear_avx2_multi_thread(benchmark::State& state, const BenchmarkInput& input) { 52 | for (auto _ : state) { 53 | bilinear_avx2_multi_thread(input); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_avx512_multi_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_avx512.hpp" 5 | // cv::ParallelLoopBody that, for each output row in the given range, calls the AVX512 interpolate once per `step` (8) columns. 6 | class InterpolateAVX512MultiThread : public cv::ParallelLoopBody 7 | { 8 | public: 9 | InterpolateAVX512MultiThread(const interpolate::BGRImage& input_image, const cv::Mat2f& coords, 10 | cv::Mat3b& output_image) 11 | : input_image_(input_image), coords_(coords), output_image_(output_image) { 12 | if (output_image.cols % step != 0) { 13 | throw std::runtime_error("output frame width must be multiple of 8"); 14 | } 15 | } 16 | // Processes output rows [range.start, range.end); the column loop advances by `step`, hence the width check in the constructor. 17 | virtual void operator()(const cv::Range& range) const override { 18 | for (auto y = range.start; y < range.end; y++) { 19 | auto* px_coords_row = coords_.ptr(y); 20 | auto* output_pixels_row = output_image_.ptr(y); 21 | 22 | for (auto x = 0; x < output_image_.cols; x += step) { 23 | auto px_coords = px_coords_row + x; 24 | auto* output_pixels = output_pixels_row + x; 25 | 26 | interpolate::bilinear::avx512::interpolate( 27 | input_image_, reinterpret_cast(px_coords), 28 | reinterpret_cast(output_pixels)); 29 | } 30 | } 31 | } 32 | 33 | private: 34 | static constexpr auto step = 8;
35 | 36 | const interpolate::BGRImage input_image_; 37 | const cv::Mat2f coords_; 38 | cv::Mat3b& output_image_; 39 | }; 40 | 41 | cv::Mat3b bilinear_avx512_multi_thread(const BenchmarkInput& input) { 42 | auto output_image = cv::Mat3b(input.output_size); 43 | auto parallel_executor = 44 | InterpolateAVX512MultiThread(input.source_image, input.coords, output_image); 45 | 46 | cv::parallel_for_(cv::Range(0, output_image.rows), parallel_executor); 47 | 48 | return output_image; 49 | } 50 | 51 | static void BM_bilinear_avx512_multi_thread(benchmark::State& state, const BenchmarkInput& input) { 52 | for (auto _ : state) { 53 | bilinear_avx512_multi_thread(input); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/benchmark/bilinear_sse4_multi_thread.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | #include "interpolate/bilinear_sse4.hpp" 5 | 6 | class InterpolateSSE4MultiThread : public cv::ParallelLoopBody 7 | { 8 | public: 9 | InterpolateSSE4MultiThread(const interpolate::BGRImage& input_image, const cv::Mat2f& coords, 10 | cv::Mat3b& output_image) 11 | : input_image_(input_image), coords_(coords), output_image_(output_image) {} 12 | 13 | virtual void operator()(const cv::Range& range) const override { 14 | auto* last_output_pixel = output_image_.ptr(range.end - 1, output_image_.cols - 1); 15 | 16 | for (auto y = range.start; y < range.end; y++) { 17 | const auto* px_coords_row = coords_.ptr(y); 18 | auto* output_px_row = output_image_.ptr(y); 19 | 20 | for (auto x = 0; x < output_image_.cols; x += 2) { 21 | const auto* px_coords = px_coords_row + x; 22 | auto* output_pixels = output_px_row + x; 23 | 24 | auto is_last_output_pixel = (output_pixels + 1 == last_output_pixel); 25 | 26 | interpolate::bilinear::sse4::interpolate( 27 | input_image_, reinterpret_cast(px_coords), 28 | reinterpret_cast(output_pixels), 
!is_last_output_pixel); 29 | } 30 | } 31 | } 32 | 33 | private: 34 | const interpolate::BGRImage& input_image_; 35 | const cv::Mat2f coords_; 36 | cv::Mat3b& output_image_; 37 | }; 38 | 39 | cv::Mat3b bilinear_sse4_multi_thread(const BenchmarkInput& input) { 40 | auto output_image = cv::Mat3b(input.output_size); 41 | auto parallel_executor = 42 | InterpolateSSE4MultiThread(input.source_image, input.coords, output_image); 43 | 44 | cv::parallel_for_(cv::Range(0, output_image.rows), parallel_executor); 45 | 46 | return output_image; 47 | } 48 | 49 | static void BM_bilinear_sse4_multi_thread(benchmark::State& state, const BenchmarkInput& input) { 50 | for (auto _ : state) { 51 | bilinear_sse4_multi_thread(input); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -2 4 | AlignAfterOpenBracket: true 5 | AlignConsecutiveAssignments: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: false 8 | AlignTrailingComments: true 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: Inline 13 | AllowShortIfStatementsOnASingleLine: false 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakBeforeMultilineStrings: false 16 | AlwaysBreakTemplateDeclarations: true 17 | BinPackArguments: true 18 | BinPackParameters: true 19 | BreakBeforeBinaryOperators: None 20 | BreakBeforeBraces: Custom 21 | BraceWrapping: 22 | AfterClass: true 23 | AfterControlStatement: false 24 | AfterEnum: false 25 | AfterFunction: false 26 | AfterNamespace: true 27 | AfterStruct: false 28 | AfterUnion: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | BreakBeforeTernaryOperators: false 33 | BreakConstructorInitializersBeforeComma: false 34 | 
ColumnLimit: 100 35 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 36 | ConstructorInitializerIndentWidth: 4 37 | ContinuationIndentWidth: 4 38 | Cpp11BracedListStyle: true 39 | DerivePointerAlignment: false 40 | IndentCaseLabels: true 41 | IndentWidth: 2 42 | IndentWrappedFunctionNames: true 43 | KeepEmptyLinesAtTheStartOfBlocks: true 44 | MaxEmptyLinesToKeep: 1 45 | NamespaceIndentation: Inner 46 | PenaltyBreakBeforeFirstCallParameter: 20 47 | PenaltyBreakComment: 0 48 | PenaltyBreakFirstLessLess: 0 49 | PenaltyBreakString: 1000 50 | PenaltyExcessCharacter: 1000000 51 | PenaltyReturnTypeOnItsOwnLine: 100 52 | PointerAlignment: Left 53 | SortIncludes: false 54 | SpaceAfterCStyleCast: true 55 | SpaceBeforeAssignmentOperators: true 56 | SpaceBeforeParens: ControlStatements 57 | SpaceInEmptyParentheses: false 58 | SpacesBeforeTrailingComments: 4 59 | SpacesInAngles: false 60 | SpacesInContainerLiterals: true 61 | SpacesInCStyleCastParentheses: false 62 | SpacesInParentheses: false 63 | SpacesInSquareBrackets: false 64 | Standard: Cpp11 65 | TabWidth: 2 66 | UseTab: Never 67 | -------------------------------------------------------------------------------- /src/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "benchmark/benchmark.h" 10 | 11 | #include "interpolate/types.hpp" 12 | 13 | struct BenchmarkInput { 14 | cv::Mat3b source_image_mat; 15 | interpolate::BGRImage source_image; 16 | cv::Mat2f coords; 17 | cv::Size2i output_size; 18 | }; 19 | 20 | static cv::Mat2f sampling_coordinates(cv::Size2i output_size, cv::Size2i input_size) { 21 | auto coords = cv::Mat2f(output_size); 22 | 23 | for (auto y = 0; y < output_size.height; y++) { 24 | for (auto x = 0; x < output_size.width; x++) { 25 | auto x_sample = (float(x) / float(output_size.width)) * input_size.width; 26 | auto y_sample = float(y) / float(output_size.height) * 
input_size.height; 27 | 28 | coords(y, x) = {y_sample, x_sample}; 29 | } 30 | } 31 | 32 | // Rotate a bit 33 | auto angle = 3.14f / 10.0f; 34 | auto warp = cv::Mat1f(2, 3); 35 | warp(0, 0) = cos(angle); 36 | warp(0, 1) = -sin(angle); 37 | warp(0, 2) = 100.0f; 38 | warp(1, 0) = sin(angle); 39 | warp(1, 1) = cos(angle); 40 | warp(1, 2) = -output_size.height / 2.0f; 41 | 42 | cv::warpAffine(coords, coords, warp, coords.size()); 43 | 44 | // This sampling coordinate will be clamped when fetching the pixel data for the next row, which 45 | // is out of bounds. 46 | coords(0, 0) = {float(input_size.height - 1), float(input_size.width - 1)}; 47 | 48 | return coords; 49 | } 50 | // Returns true when a and b have the same size and every channel of every pixel differs by at most 3; otherwise prints the reason to std::cout and returns false. Declared static (internal linkage) because it is defined in a header. 51 | static bool mats_equivalent(const cv::Mat3b& a, const cv::Mat3b& b) { 52 | if ((a.rows != b.rows) || (a.cols != b.cols)) { 53 | std::cout << "mats different size\n"; 54 | return false; 55 | } 56 | 57 | for (auto y = 0; y < a.rows; y++) { 58 | for (auto x = 0; x < a.cols; x++) { 59 | cv::Vec3b px1 = a(y, x); 60 | cv::Vec3b px2 = b(y, x); 61 | 62 | auto b_diff = std::abs(px1[0] - px2[0]); 63 | auto g_diff = std::abs(px1[1] - px2[1]); 64 | auto r_diff = std::abs(px1[2] - px2[2]); 65 | 66 | // TODO: there are some very slight differences with the SIMD interpolations, 67 | // probably due to rounding. 68 | if (b_diff > 3 || g_diff > 3 || r_diff > 3) { 69 | std::cout << "pixels not equal at " << x << "x" << y << "\n"; 70 | std::cout << int32_t(px1[0]) << " " << int32_t(px1[1]) << " " << int32_t(px1[2]) << "\n"; 71 | std::cout << int32_t(px2[0]) << " " << int32_t(px2[1]) << " " << int32_t(px2[2]) << "\n"; 72 | return false; 73 | } 74 | } 75 | } 76 | 77 | return true; 78 | } 79 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Bilinear image filter using SSE, AVX2 and AVX512 2 | 3 | Demonstrates bilinear image filtering using SIMD (SSE, AVX2 and AVX512) and multithreading.
4 | 5 | Requires a CPU with AVX2 support. AVX512 will be used if it is available. 6 | 7 | ## Dependencies 8 | 9 | * C++17 (GCC >= 8) 10 | * OpenCV 4 11 | * CMake 12 | 13 | ## Build and run 14 | 15 | ``` 16 | git submodule update --init --recursive --jobs 4 17 | mkdir -p build 18 | cd build 19 | cmake .. 20 | make -j$(nproc) 21 | ./bilinear_filter_simd 22 | ``` 23 | 24 | Displays benchmark numbers for different algorithms. Multithreaded AVX512 is the fastest. 25 | 26 | ## Benchmark results 27 | 28 | ``` 29 | 2020-08-09 11:34:16 30 | Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz 31 | Input image size: 3840x2160 32 | Output image size: 1280x720 33 | OpenCV: numberOfCPUS=16 getNumThreads=16 34 | Running ./bilinear_filter_simd 35 | Run on (16 X 3400.97 MHz CPU s) 36 | CPU Caches: 37 | L1 Data 32 KiB (x8) 38 | L1 Instruction 32 KiB (x8) 39 | L2 Unified 1024 KiB (x8) 40 | L3 Unified 25344 KiB (x1) 41 | Load Average: 0.00, 0.04, 0.13 42 | --------------------------------------------------------------------------------- 43 | Benchmark Time CPU Iterations 44 | --------------------------------------------------------------------------------- 45 | No SIMD - single thread/min_time:2.000 13.3 ms 13.3 ms 215 46 | No SIMD - multi thread/min_time:2.000 1.25 ms 1.25 ms 2227 47 | SSE4 - single thread/min_time:2.000 10.7 ms 10.7 ms 267 48 | SSE4 - multi thread/min_time:2.000 0.987 ms 0.986 ms 2856 49 | AVX2 - single thread/min_time:2.000 8.14 ms 8.14 ms 360 50 | AVX2 - multi thread/min_time:2.000 0.868 ms 0.868 ms 3253 51 | AVX512 - single thread/min_time:2.000 7.33 ms 7.33 ms 373 52 | AVX512 - multi thread/min_time:2.000 0.822 ms 0.822 ms 3430 53 | ``` 54 | 55 | ``` 56 | 2020-08-09 23:39:59 57 | Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz 58 | Running ./bilinear_filter_simd 59 | Run on (12 X 4600 MHz CPU s) 60 | CPU Caches: 61 | L1 Data 32 KiB (x6) 62 | L1 Instruction 32 KiB (x6) 63 | L2 Unified 256 KiB (x6) 64 | L3 Unified 12288 KiB (x1) 65 | Load Average: 0.05, 0.06, 0.01 66 | 
--------------------------------------------------------------------------------- 67 | Benchmark Time CPU Iterations 68 | --------------------------------------------------------------------------------- 69 | No SIMD - single thread/min_time:2.000 7.64 ms 7.64 ms 367 70 | No SIMD - multi thread/min_time:2.000 1.69 ms 1.69 ms 1675 71 | SSE4 - single thread/min_time:2.000 5.26 ms 5.26 ms 532 72 | SSE4 - multi thread/min_time:2.000 1.39 ms 1.39 ms 2010 73 | AVX2 - single thread/min_time:2.000 4.26 ms 4.26 ms 658 74 | AVX2 - multi thread/min_time:2.000 1.34 ms 1.34 ms 2082 75 | ``` 76 | -------------------------------------------------------------------------------- /src/interpolate/bilinear_plain.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "interpolate/types.hpp" 4 | 5 | namespace interpolate::bilinear::plain 6 | { 7 | 8 | static inline interpolate::BGRPixel interpolate(const interpolate::BGRImage& image, 9 | const interpolate::InputCoords& input_coords) { 10 | auto px = int(input_coords.x); // floor x 11 | auto py = int(input_coords.y); // floor y 12 | 13 | // Four neighbouring pixels 14 | const auto* pixel = image.ptr(py, px); 15 | 16 | const auto& p1 = pixel[0]; 17 | const auto& p2 = pixel[1]; 18 | const auto* pixel_below = image.ptr_below(pixel); 19 | const auto& p3 = pixel_below[0]; 20 | const auto& p4 = pixel_below[1]; 21 | 22 | // Calculate the weights for each pixel 23 | float fx = input_coords.x - px; 24 | float fy = input_coords.y - py; 25 | float fx1 = 1.0f - fx; 26 | float fy1 = 1.0f - fy; 27 | 28 | // Using int for the weights is a bit faster than using floats 29 | int w1 = fx1 * fy1 * 256.0f; 30 | int w2 = fx * fy1 * 256.0f; 31 | int w3 = fx1 * fy * 256.0f; 32 | int w4 = fx * fy * 256.0f; 33 | 34 | // Calculate the weighted sum of pixels (for each color channel) 35 | int outr = p1.r * w1 + p2.r * w2 + p3.r * w3 + p4.r * w4; 36 | int outg = p1.g * w1 + p2.g * w2 + p3.g * w3 + p4.g 
* w4; 37 | int outb = p1.b * w1 + p2.b * w2 + p3.b * w3 + p4.b * w4; 38 | 39 | return {uint8_t(outb >> 8), uint8_t(outg >> 8), uint8_t(outr >> 8)}; 40 | } 41 | 42 | template 43 | static inline void interpolate_multiple(const interpolate::BGRImage& image, 44 | interpolate::BGRPixel* output, 45 | const interpolate::InputCoords* input_coords) { 46 | interpolate::BGRPixel output_pixels[N]; 47 | 48 | for (int i = 0; i < N; i++) { 49 | auto px = int(input_coords[i].x); // floor x 50 | auto py = int(input_coords[i].y); // floor y 51 | 52 | // Four neighbouring pixels 53 | const auto* pixel = image.ptr(py, px); 54 | const auto& p1 = pixel[0]; 55 | const auto& p2 = pixel[1]; 56 | const auto* pixel_below = image.ptr_below(pixel); 57 | const auto& p3 = pixel_below[0]; 58 | const auto& p4 = pixel_below[1]; 59 | 60 | // Calculate the weights for each pixel 61 | float fx = input_coords[i].x - px; 62 | float fy = input_coords[i].y - py; 63 | float fx1 = 1.0f - fx; 64 | float fy1 = 1.0f - fy; 65 | 66 | // Using int for the weights is a bit faster than using floats 67 | int w1 = fx1 * fy1 * 256.0f; 68 | int w2 = fx * fy1 * 256.0f; 69 | int w3 = fx1 * fy * 256.0f; 70 | int w4 = fx * fy * 256.0f; 71 | 72 | // Calculate the weighted sum of pixels (for each color channel) 73 | int outr = p1.r * w1 + p2.r * w2 + p3.r * w3 + p4.r * w4; 74 | int outg = p1.g * w1 + p2.g * w2 + p3.g * w3 + p4.g * w4; 75 | int outb = p1.b * w1 + p2.b * w2 + p3.b * w3 + p4.b * w4; 76 | 77 | output_pixels[i] = {uint8_t(outb >> 8), uint8_t(outg >> 8), uint8_t(outr >> 8)}; 78 | } 79 | 80 | memcpy(output, output_pixels, sizeof(interpolate::BGRPixel) * N); 81 | } 82 | 83 | } // namespace interpolate::bilinear::plain 84 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | 3 | #include "benchmark/bilinear_plain_single_thread.hpp" 4 | #include 
"benchmark/bilinear_plain_multi_thread.hpp" 5 | #include "benchmark/bilinear_sse4_single_thread.hpp" 6 | #include "benchmark/bilinear_sse4_multi_thread.hpp" 7 | #include "benchmark/bilinear_avx2_single_thread.hpp" 8 | #include "benchmark/bilinear_avx2_multi_thread.hpp" 9 | 10 | #ifdef __AVX512F__ 11 | #include "benchmark/bilinear_avx512_single_thread.hpp" 12 | #include "benchmark/bilinear_avx512_multi_thread.hpp" 13 | #endif 14 | 15 | BenchmarkInput create_benchmark_input() { 16 | auto benchmark_input = BenchmarkInput(); 17 | 18 | auto source_image = cv::imread("../assets/155603.jpg"); 19 | benchmark_input.source_image_mat = source_image; 20 | 21 | benchmark_input.source_image = interpolate::BGRImage( 22 | source_image.rows, source_image.cols, source_image.step, 23 | reinterpret_cast(source_image.ptr(0, 0))); 24 | 25 | benchmark_input.output_size = cv::Size2i(1280, 720); 26 | benchmark_input.coords = 27 | sampling_coordinates(benchmark_input.output_size, benchmark_input.source_image_mat.size()); 28 | 29 | return benchmark_input; 30 | } 31 | 32 | void compare_mats(const cv::Mat3b& gold_standard, std::string name, const cv::Mat3b& comparison) { 33 | if (!mats_equivalent(gold_standard, comparison)) { 34 | std::cout << name << " output image not the same\n"; 35 | std::exit(1); 36 | } 37 | } 38 | 39 | void validate_implementations(BenchmarkInput& benchmark_input) { 40 | auto gold_standard = bilinear_plain_single_thread(benchmark_input); 41 | 42 | compare_mats(gold_standard, "plain multi thread", bilinear_plain_multi_thread(benchmark_input)); 43 | compare_mats(gold_standard, "sse4 single thread", bilinear_sse4_single_thread(benchmark_input)); 44 | compare_mats(gold_standard, "sse4 multi thread", bilinear_sse4_multi_thread(benchmark_input)); 45 | compare_mats(gold_standard, "avx2 single thread", bilinear_avx2_single_thread(benchmark_input)); 46 | compare_mats(gold_standard, "avx2 multi thread", bilinear_avx2_multi_thread(benchmark_input)); 47 | 48 | #ifdef __AVX512F__ 49 
| compare_mats(gold_standard, "avx512 single thread", 50 | bilinear_avx512_single_thread(benchmark_input)); 51 | compare_mats(gold_standard, "avx512 multi thread", bilinear_avx512_multi_thread(benchmark_input)); 52 | #endif 53 | } 54 | 55 | void register_benchmarks(BenchmarkInput& benchmark_input) { 56 | auto benchmarks = std::vector(); 57 | 58 | benchmarks.push_back(benchmark::RegisterBenchmark( 59 | "No SIMD - single thread", BM_bilinear_plain_single_thread, benchmark_input)); 60 | benchmarks.push_back(benchmark::RegisterBenchmark( 61 | "No SIMD - multi thread", BM_bilinear_plain_multi_thread, benchmark_input)); 62 | 63 | benchmarks.push_back(benchmark::RegisterBenchmark( 64 | "SSE4 - single thread", BM_bilinear_sse4_single_thread, benchmark_input)); 65 | benchmarks.push_back(benchmark::RegisterBenchmark( 66 | "SSE4 - multi thread", BM_bilinear_sse4_multi_thread, benchmark_input)); 67 | 68 | benchmarks.push_back(benchmark::RegisterBenchmark( 69 | "AVX2 - single thread", BM_bilinear_avx2_single_thread, benchmark_input)); 70 | benchmarks.push_back(benchmark::RegisterBenchmark( 71 | "AVX2 - multi thread", BM_bilinear_avx2_multi_thread, benchmark_input)); 72 | 73 | #ifdef __AVX512F__ 74 | benchmarks.push_back(benchmark::RegisterBenchmark( 75 | "AVX512 - single thread", BM_bilinear_avx512_single_thread, benchmark_input)); 76 | benchmarks.push_back(benchmark::RegisterBenchmark( 77 | "AVX512 - multi thread", BM_bilinear_avx512_multi_thread, benchmark_input)); 78 | #endif 79 | 80 | for (auto bm : benchmarks) { 81 | bm->Unit(benchmark::kMillisecond); 82 | bm->MinTime(2.0); 83 | } 84 | } 85 | 86 | int main(int argc, char** argv) { 87 | auto benchmark_input = create_benchmark_input(); 88 | 89 | validate_implementations(benchmark_input); 90 | register_benchmarks(benchmark_input); 91 | 92 | printf("Input image size: %dx%d\n", benchmark_input.source_image.cols, 93 | benchmark_input.source_image.rows); 94 | printf("Output image size: %dx%d\n", benchmark_input.output_size.width, 
95 | benchmark_input.output_size.height); 96 | printf("OpenCV: numberOfCPUS=%d getNumThreads=%d\n", cv::getNumberOfCPUs(), cv::getNumThreads()); 97 | 98 | benchmark::Initialize(&argc, argv); 99 | benchmark::RunSpecifiedBenchmarks(); 100 | } 101 | -------------------------------------------------------------------------------- /src/interpolate/bilinear_sse4.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "interpolate/types.hpp" 5 | 6 | namespace interpolate::bilinear::sse4 7 | { 8 | 9 | // SSE bilinear interpolation implementation based on 10 | // http://fastcpp.blogspot.com/2011/06/bilinear-pixel-interpolation-using-sse.html 11 | // and altered to work with BGR24 image format. 12 | 13 | static const __m128i WEIGHTS_Y_SHUFFLE = 14 | _mm_set_epi8(11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0); 15 | 16 | // Calculate the interpolation weights for 2 pixels. 17 | // Returns weights as 16 bit ints. 18 | // (px2) w4 w3 w2 w1 (px1) w4 w3 w2 w1 19 | static inline __m128i calc_weights(const float sample_coords[4]) { 20 | const __m128 initial = _mm_castsi128_ps(_mm_stream_load_si128((__m128i*) sample_coords)); 21 | 22 | const __m128 floored = _mm_floor_ps(initial); 23 | const __m128 fractional = _mm_sub_ps(initial, floored); 24 | 25 | // Convert fractional parts to 32 bit ints in range 0-256 26 | // x2 y2 x1 y1 27 | __m128i lower = _mm_cvtps_epi32(_mm_mul_ps(fractional, _mm_set1_ps(256.0f))); 28 | 29 | // Convert to 16 bit ints 30 | // 0 0 0 0 x2 y2 x1 y1 31 | lower = _mm_packs_epi32(lower, _mm_set1_epi32(0)); 32 | 33 | // Get the 1-fractional from the 16 bit result 34 | // 256 256 256 256 1-x2 1-y2 1-x1 1-y1 35 | const __m128i upper = _mm_sub_epi16(_mm_set1_epi16(256), lower); 36 | 37 | // Combine so we have all the parts in one value to shuffle 38 | // x2 (1-x2) y2 (1-y2) x1 (1-x1) y1 (1-y1) 39 | const __m128i combined = _mm_unpacklo_epi16(upper, lower); 40 | 41 | // x2 (1-x2) x2 
(1-x2) x1 (1-x1) x1 (1-x1) 42 | const __m128i weights_x = _mm_shuffle_epi32(combined, _MM_SHUFFLE(3, 3, 1, 1)); 43 | 44 | // y2 y2 (1-y2) (1-y2) y1 y1 (1-y1) (1-y1) 45 | // Shuffle 16 bit numbers as 8 bits because there is no _mm256_shuffle_epi16 46 | __m128i weights_y = _mm_shuffle_epi8(combined, WEIGHTS_Y_SHUFFLE); 47 | 48 | // Multiply to get per pixel weights. Divide by 256 to get back into correct range. 49 | __m128i weights = _mm_srli_epi16(_mm_mullo_epi16(weights_x, weights_y), 8); 50 | 51 | // If both weights were 256, the result is 65536 which is all 0s in the lower 16 bits. 52 | // Find the weights this happened to, and replace them with 256. 53 | __m128i weights_hi = _mm_mulhi_epi16(weights_x, weights_y); 54 | __m128i weights_hi_mask = _mm_cmpgt_epi16(weights_hi, _mm_setzero_si128()); 55 | weights = _mm_blendv_epi8(weights, _mm_set1_epi16(256), weights_hi_mask); 56 | 57 | return weights; 58 | } 59 | 60 | static inline __m128i interpolate_one_pixel(const interpolate::BGRImage& image, 61 | const interpolate::InputCoords& input_coords, 62 | __m128i w12, __m128i w34) { 63 | 64 | const auto* p0 = image.ptr(input_coords.y, input_coords.x); 65 | 66 | // Load 4 pixels for interpolation. 67 | // We are only using the lower 48 bits of each load. 
68 | // Shuffle 24bpp around to use 64bpp with 16bpc 69 | // _ r g b _ r g b 70 | __m128i p12 = 71 | _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*) p0), 72 | _mm_set_epi8(-1, -1, -1, 5, -1, 4, -1, 3, -1, -1, -1, 2, -1, 1, -1, 0)); 73 | 74 | // _ r g b _ r g b 75 | __m128i p34 = 76 | _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*) image.ptr_below(p0)), 77 | _mm_set_epi8(-1, -1, -1, 5, -1, 4, -1, 3, -1, -1, -1, 2, -1, 1, -1, 0)); 78 | 79 | // Multiply each pixel with its weight 80 | const __m128i out_12 = _mm_mullo_epi16(p12, w12); 81 | const __m128i out_34 = _mm_mullo_epi16(p34, w34); 82 | 83 | // Sum the results 84 | __m128i out_1234 = _mm_add_epi16(out_12, out_34); 85 | __m128i out_high = _mm_shuffle_epi32(out_1234, _MM_SHUFFLE(3, 2, 3, 2)); 86 | __m128i out = _mm_add_epi16(out_1234, out_high); 87 | 88 | // Divide by 256 89 | out = _mm_srli_epi16(out, 8); 90 | 91 | // Convert to 8bpc 92 | out = _mm_packus_epi16(out, _mm_setzero_si128()); 93 | 94 | return out; 95 | } 96 | 97 | static inline void write_output_pixels(__m128i pixel_1, __m128i pixel_2, 98 | interpolate::BGRPixel output_pixels[2], 99 | bool can_write_third_pixel) { 100 | // _ _ 2 1 101 | __m128i combined = _mm_unpacklo_epi32(pixel_1, pixel_2); 102 | 103 | // Shuffle to pack 2 pixels into lower 48 bits 104 | combined = _mm_shuffle_epi8(combined, 105 | _mm_set_epi8( 106 | // Upper 80 bits not used 107 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 108 | // Packed pixel data 109 | 6, 5, 4, 2, 1, 0)); 110 | 111 | // Write the pixel data. Faster to write 8 bytes when allowed. 112 | uint64_t interpolated_pixels = _mm_cvtsi128_si64(combined); 113 | memcpy(output_pixels, &interpolated_pixels, can_write_third_pixel ? 
8 : 6); 114 | } 115 | 116 | static inline void interpolate(const interpolate::BGRImage& image, 117 | const interpolate::InputCoords input_coords[2], 118 | interpolate::BGRPixel output_pixels[2], bool can_write_third_pixel) { 119 | 120 | // Calculate the weights for 2 pixels 121 | __m128i weights = calc_weights(&input_coords[0].y); 122 | 123 | // Prepare weights for pixel 1 124 | __m128i pixel1_w12 = _mm_shufflelo_epi16(weights, _MM_SHUFFLE(1, 1, 0, 0)); 125 | __m128i pixel1_w34 = _mm_shufflelo_epi16(weights, _MM_SHUFFLE(3, 3, 2, 2)); 126 | // w2 w2 w2 w2 w1 w1 w1 w1 127 | pixel1_w12 = _mm_unpacklo_epi16(pixel1_w12, pixel1_w12); 128 | // w4 w4 w4 w4 w3 w3 w3 w3 129 | pixel1_w34 = _mm_unpacklo_epi16(pixel1_w34, pixel1_w34); 130 | 131 | // Prepare weights for pixel 2 132 | __m128i pixel2_w12 = _mm_shufflehi_epi16(weights, _MM_SHUFFLE(1, 1, 0, 0)); 133 | __m128i pixel2_w34 = _mm_shufflehi_epi16(weights, _MM_SHUFFLE(3, 3, 2, 2)); 134 | // w2 w2 w2 w2 w1 w1 w1 w1 135 | pixel2_w12 = _mm_unpackhi_epi16(pixel2_w12, pixel2_w12); 136 | // w4 w4 w4 w4 w3 w3 w3 w3 137 | pixel2_w34 = _mm_unpackhi_epi16(pixel2_w34, pixel2_w34); 138 | 139 | const __m128i pixel_1 = interpolate_one_pixel(image, input_coords[0], pixel1_w12, pixel1_w34); 140 | const __m128i pixel_2 = interpolate_one_pixel(image, input_coords[1], pixel2_w12, pixel2_w34); 141 | 142 | write_output_pixels(pixel_1, pixel_2, output_pixels, can_write_third_pixel); 143 | } 144 | 145 | } // namespace interpolate::bilinear::sse4 146 | -------------------------------------------------------------------------------- /src/interpolate/bilinear_avx2.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "interpolate/types.hpp" 4 | #include 5 | 6 | namespace interpolate::bilinear::avx2 7 | { 8 | 9 | static const __m256i WEIGHTS_Y_SHUFFLE = 10 | _mm256_set_epi8(11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0, 11 | // Repeated 12 | 11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 
2, 1, 0, 1, 0); 13 | 14 | // Calculate the weights for the 4 surrounding pixels of 4 independent xy pairs. 15 | // Returns weights as 16 bit ints. 16 | // Eg: w4 w3 w2 w1 (x4/y4) w4 w3 w2 w1 (x3/y3) | w4 w3 w2 w1 (x2/y2) w4 w3 w2 w1 (x1/y1) 17 | static inline __m256i calculate_weights(const float sample_coords[8]) { 18 | const __m256 initial = 19 | _mm256_castsi256_ps(_mm256_stream_load_si256((const __m256i*) sample_coords)); 20 | 21 | const __m256 floored = _mm256_floor_ps(initial); 22 | const __m256 fractional = _mm256_sub_ps(initial, floored); 23 | 24 | // Convert fractional parts to 32 bit ints in range 0-256 25 | // x4 y4 x3 y3 | x2 y2 x1 y1 26 | __m256i lower = _mm256_cvtps_epi32(_mm256_mul_ps(fractional, _mm256_set1_ps(256.0f))); 27 | 28 | // Convert to 16 bit ints 29 | // 0 0 0 0 x4 y4 x3 y3 | 0 0 0 0 x2 y2 x1 y1 30 | lower = _mm256_packs_epi32(lower, _mm256_set1_epi32(0)); 31 | 32 | // Get the 1-fractional from the 16 bit result 33 | // 256 256 256 256 1-x4 1-y4 1-x3 1-y3 | 256 256 256 256 1-x2 1-y2 1-x1 1-y1 34 | const __m256i upper = _mm256_sub_epi16(_mm256_set1_epi16(256), lower); 35 | 36 | // ...y4 ...y3 | ...y2 1-x1 x1 1-y1 y1 37 | const __m256i combined = _mm256_unpacklo_epi16(upper, lower); 38 | 39 | // ...(1-x4) ...(1-x3) | ...(1-x2) x1 (1-x1) x1 (1-x1) 40 | const __m256i weights_x = _mm256_shuffle_epi32(combined, _MM_SHUFFLE(3, 3, 1, 1)); 41 | 42 | // ...(1-y4) ...(1-y3) | ...(1-y2) y1 y1 (1-y1) (1-y1) 43 | // Shuffle 16 bit numbers as 8 bits because there is no _mm256_shuffle_epi16 44 | const __m256i weights_y = _mm256_shuffle_epi8(combined, WEIGHTS_Y_SHUFFLE); 45 | 46 | // Multiply to get final per pixel weights. Divide by 256 to get back into correct range. 47 | // ...(x4/y4) ... (x3/y3) | ... (x2/y2) w4 w3 w2 w1 (x1/y1) 48 | __m256i weights = _mm256_srli_epi16(_mm256_mullo_epi16(weights_x, weights_y), 8); 49 | 50 | // If both weights were 256, the result is 65536 which is all 0s in the lower 16 bits. 
51 | // Find the weights this happened to, and replace them with 256. 52 | const __m256i weights_hi = _mm256_mulhi_epi16(weights_x, weights_y); 53 | const __m256i weights_hi_mask = _mm256_cmpgt_epi16(weights_hi, _mm256_setzero_si256()); 54 | weights = _mm256_blendv_epi8(weights, _mm256_set1_epi16(256), weights_hi_mask); 55 | 56 | return weights; 57 | } 58 | 59 | // Mask to shuffle the blue and green channels from packed 24bpp to 64bpp (16bpc) in each lane. 60 | // Upper and lower lanes of input should contain independent sets of 4 pixels. 61 | // Eg: 62 | // (12 other bits) rgb rgb (12 other bits) rgb rgb | (12 other bits) rgb rgb (12 other bits) rgb 63 | // rgb Becomes g g g g b b b b | g g g g b b b b 64 | static const __m128i MASK_SHUFFLE_BG_HALF = _mm_set_epi8( 65 | // green 66 | -1, 12, -1, 9, -1, 4, -1, 1, 67 | // blue 68 | -1, 11, -1, 8, -1, 3, -1, 0); 69 | 70 | static const __m256i MASK_SHUFFLE_BG = _mm256_set_m128i(MASK_SHUFFLE_BG_HALF, MASK_SHUFFLE_BG_HALF); 71 | 72 | // Do the same with the red channel. The upper half of each lane is not used. 
73 | static const __m128i MASK_SHUFFLE_R0_HALF = _mm_set_epi8( 74 | // unused 75 | -1, -1, -1, -1, -1, -1, -1, -1, 76 | // red 77 | -1, 13, -1, 10, -1, 5, -1, 2); 78 | 79 | static const __m256i MASK_SHUFFLE_R0 = _mm256_set_m128i(MASK_SHUFFLE_R0_HALF, MASK_SHUFFLE_R0_HALF); 80 | 81 | static inline __m256i interpolate_two_pixels(const interpolate::BGRImage& image, 82 | const interpolate::InputCoords input_coords[3], 83 | __m256i weights) { 84 | // Load pixel data 85 | const auto* p0_0 = image.ptr(input_coords[0].y, input_coords[0].x); 86 | const auto* p1_0 = image.ptr(input_coords[2].y, input_coords[2].x); 87 | 88 | const __m256i pixels = _mm256_set_epi64x(*((int64_t*) image.ptr_below(p1_0)), *((int64_t*) p1_0), 89 | *((int64_t*) image.ptr_below(p0_0)), *((int64_t*) p0_0)); 90 | 91 | const __m256i pixels_bg = _mm256_shuffle_epi8(pixels, MASK_SHUFFLE_BG); 92 | const __m256i pixels_r0 = _mm256_shuffle_epi8(pixels, MASK_SHUFFLE_R0); 93 | 94 | // Multiply with the pixel data and sum adjacent pairs to 32 bit ints 95 | // g g b b | g g b b 96 | const __m256i result_bg = _mm256_madd_epi16(pixels_bg, weights); 97 | // _ _ r r | _ _ r r 98 | const __m256i result_r0 = _mm256_madd_epi16(pixels_r0, weights); 99 | 100 | // Add adjacent pairs again. 32 bpc. 101 | // _ r g b | _ r g b 102 | __m256i result = _mm256_hadd_epi32(result_bg, result_r0); 103 | 104 | // Divide by 256 to get back into correct range. 
105 | result = _mm256_srli_epi32(result, 8); 106 | 107 | // Convert from 32bpc => 16bpc => 8bpc 108 | result = _mm256_packus_epi32(result, _mm256_setzero_si256()); 109 | result = _mm256_packus_epi16(result, _mm256_setzero_si256()); 110 | 111 | return result; 112 | } 113 | 114 | // Slightly faster than memcpy 115 | static inline void memcpy_12(uint8_t* dst, const uint8_t* src) { 116 | *((uint64_t*) dst) = *((uint64_t*) src); 117 | *((uint32_t*) (dst + 8)) = *((uint32_t*) (src + 8)); 118 | } 119 | 120 | static inline void write_output_pixels(__m256i pixels_13, __m256i pixels_24, 121 | interpolate::BGRPixel output_pixels[4]) { 122 | // Unpack to get adjacent pixel data in lower 64 bits of each lane 123 | // _ _ 4 3 | _ _ 2 1 124 | __m256i combined = _mm256_unpacklo_epi32(pixels_13, pixels_24); 125 | 126 | // Then pack everything into 1 lane 127 | // _ _ _ _ | 4 3 2 1 128 | combined = _mm256_permute4x64_epi64(combined, _MM_SHUFFLE(3, 3, 2, 0)); 129 | 130 | // Shuffle around to get packed 24bpp at the bottom of the lower lane 131 | combined = _mm256_shuffle_epi8(combined, 132 | _mm256_set_epi8( 133 | // Top lane not used 134 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 135 | // Bottom lane - 32 unused bits at the top 136 | -1, -1, -1, -1, 137 | // Packed pixel data 138 | 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0)); 139 | 140 | // Write out the lower 12 bytes 141 | alignas(32) uint8_t interpolated_pixels[32]; 142 | _mm256_store_si256((__m256i*) interpolated_pixels, combined); 143 | memcpy_12((uint8_t*) output_pixels, interpolated_pixels); 144 | } 145 | 146 | // Bilinear interpolation of 4 adjacent output pixels with the supplied coordinates using AVX2. 
147 | static inline void interpolate(const interpolate::BGRImage& image, 148 | const interpolate::InputCoords input_coords[4], 149 | interpolate::BGRPixel output_pixels[4]) { 150 | // Calculate weights for 4 pixels 151 | const __m256i weights = calculate_weights(&input_coords[0].y); 152 | 153 | // Prepare weights for pixels 1 and 3, and interpolate 154 | const __m256i weights_13 = _mm256_unpacklo_epi64(weights, weights); 155 | const __m256i pixels_13 = interpolate_two_pixels(image, input_coords, weights_13); 156 | 157 | // Same for pixels 2 and 4 158 | const __m256i weights_24 = _mm256_unpackhi_epi64(weights, weights); 159 | const __m256i pixels_24 = interpolate_two_pixels(image, input_coords + 1, weights_24); 160 | 161 | write_output_pixels(pixels_13, pixels_24, output_pixels); 162 | } 163 | 164 | } // namespace interpolate::bilinear::avx2 165 | -------------------------------------------------------------------------------- /src/interpolate/bilinear_avx512.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "interpolate/types.hpp" 6 | 7 | namespace interpolate::bilinear::avx512 8 | { 9 | 10 | // 11 | // Weights 12 | // 13 | 14 | static const __m512i WEIGHTS_Y_SHUFFLE = 15 | _mm512_set_epi8(11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0, 16 | // Repeated 17 | 11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0, 18 | // Repeated 19 | 11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0, 20 | // Repeated 21 | 11, 10, 11, 10, 9, 8, 9, 8, 3, 2, 3, 2, 1, 0, 1, 0); 22 | 23 | static inline __m512i calculate_weights(const float sample_coords[16]) { 24 | const __m512 initial = _mm512_load_ps(sample_coords); 25 | 26 | const __m512 floored = _mm512_floor_ps(initial); 27 | const __m512 fractional = _mm512_sub_ps(initial, floored); 28 | 29 | // Convert fractional parts to 32 bit ints in range 0-256 30 | // ... 
| x2 y2 x1 y1 31 | __m512i lower = _mm512_cvtps_epi32(_mm512_mul_ps(fractional, _mm512_set1_ps(256.0f))); 32 | 33 | // Convert to 16 bit ints 34 | // ... | 0 0 0 0 x2 y2 x1 y1 35 | lower = _mm512_packs_epi32(lower, _mm512_set1_epi32(0)); 36 | 37 | // Subtract each value from 256 38 | // ... | 256 256 256 256 1-x2 1-y2 1-x1 1-y1 39 | const __m512i upper = _mm512_sub_epi16(_mm512_set1_epi16(256), lower); 40 | 41 | // Combine all the weights into a single vector 42 | // ... | 1-y2 1-x1 x1 1-y1 y1 43 | const __m512i combined = _mm512_unpacklo_epi16(upper, lower); 44 | 45 | // x weights 46 | // ... | ...(1-x2) x1 (1-x1) x1 (1-x1) 47 | __m512i weights_x = _mm512_shuffle_epi32(combined, _MM_PERM_DDBB); 48 | 49 | // y weights 50 | // ... | ...(1-y2) y1 y1 (1-y1) (1-y1) 51 | __m512i weights_y = _mm512_shuffle_epi8(combined, WEIGHTS_Y_SHUFFLE); 52 | 53 | // Multiply to get final per pixel weights. Divide by 256 to get back into correct range. 54 | // ... | ... (x2/y2) w4 w3 w2 w1 (x1/y1) 55 | __m512i weights = _mm512_mullo_epi16(weights_x, weights_y); 56 | weights = _mm512_srli_epi16(weights, 8); 57 | 58 | // If both weights were 256, the result is 65536 which is all 0s in the lower 16 bits. 59 | // Find the weights this happened to, and replace them with 256. 60 | __m512i weights_hi = _mm512_mulhi_epi16(weights_x, weights_y); 61 | __mmask32 weights_hi_mask = _mm512_cmpgt_epi16_mask(weights_hi, _mm512_setzero_si512()); 62 | weights = _mm512_mask_blend_epi16(weights_hi_mask, weights, _mm512_set1_epi16(256)); 63 | 64 | return weights; 65 | } 66 | 67 | // Masks to shuffle initial pixel data from packed 24bpp to 64bpp (16bpc) in each lane. 68 | // Eg, a 128 bit lane with the following data: 69 | // (16 other bits) rgb rgb (16 other bits) rgb rgb 70 | // Becomes: 71 | // g g g g b b b b 72 | 73 | // Blue and green channels. 
74 | #define MASK_SHUFFLE_BG_SINGLE_LANE -1, 12, -1, 9, -1, 4, -1, 1, -1, 11, -1, 8, -1, 3, -1, 0 75 | static const __m512i MASK_SHUFFLE_BG = 76 | _mm512_set_epi8(MASK_SHUFFLE_BG_SINGLE_LANE, MASK_SHUFFLE_BG_SINGLE_LANE, 77 | MASK_SHUFFLE_BG_SINGLE_LANE, MASK_SHUFFLE_BG_SINGLE_LANE); 78 | 79 | // Red channel. The upper half of each 128 bit lane is not used. 80 | #define MASK_SHUFFLE_R0_SINGLE_LANE -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, 10, -1, 5, -1, 2 81 | static const __m512i MASK_SHUFFLE_R0 = 82 | _mm512_set_epi8(MASK_SHUFFLE_R0_SINGLE_LANE, MASK_SHUFFLE_R0_SINGLE_LANE, 83 | MASK_SHUFFLE_R0_SINGLE_LANE, MASK_SHUFFLE_R0_SINGLE_LANE); 84 | 85 | // 86 | // Interpolation 87 | // 88 | 89 | static inline __m512i interpolate_four_pixels(const interpolate::BGRImage& image, 90 | const interpolate::InputCoords input_coords[7], 91 | __m512i weights) { 92 | // Load pixel data 93 | const auto* p1 = image.ptr(input_coords[0].y, input_coords[0].x); 94 | const auto* p2 = image.ptr(input_coords[2].y, input_coords[2].x); 95 | const auto* p3 = image.ptr(input_coords[4].y, input_coords[4].x); 96 | const auto* p4 = image.ptr(input_coords[6].y, input_coords[6].x); 97 | 98 | __m512i pixels = _mm512_set_epi64(*((int64_t*) image.ptr_below(p4)), *((int64_t*) p4), 99 | *((int64_t*) image.ptr_below(p3)), *((int64_t*) p3), 100 | *((int64_t*) image.ptr_below(p2)), *((int64_t*) p2), 101 | *((int64_t*) image.ptr_below(p1)), *((int64_t*) p1)); 102 | 103 | __m512i pixels_bg = _mm512_shuffle_epi8(pixels, MASK_SHUFFLE_BG); 104 | __m512i pixels_r0 = _mm512_shuffle_epi8(pixels, MASK_SHUFFLE_R0); 105 | 106 | // Multiply with the pixel data and sum adjacent pairs to 32 bit ints 107 | // ... | g g b b 108 | __m512i result_bg = _mm512_madd_epi16(pixels_bg, weights); 109 | // ... | _ g _ b 110 | result_bg = _mm512_add_epi32(result_bg, _mm512_srli_epi64(result_bg, 32)); 111 | // ... | _ _ g b 112 | result_bg = _mm512_shuffle_epi32(result_bg, _MM_PERM_DDCA); 113 | 114 | // ... 
| _ _ r r 115 | __m512i result_r0 = _mm512_madd_epi16(pixels_r0, weights); 116 | // ... | _ _ _ r 117 | result_r0 = _mm512_add_epi32(result_r0, _mm512_srli_epi64(result_r0, 32)); 118 | 119 | // Add adjacent pairs again. 32 bpc. 120 | // ... | _ r g b 121 | __m512i out = _mm512_unpacklo_epi64(result_bg, result_r0); 122 | 123 | // Divide by 256 to get back into correct range. 124 | out = _mm512_srli_epi32(out, 8); 125 | 126 | // Convert from 32bpc => 16bpc => 8bpc 127 | out = _mm512_packus_epi32(out, _mm512_setzero_si512()); 128 | out = _mm512_packus_epi16(out, _mm512_setzero_si512()); 129 | 130 | return out; 131 | } 132 | 133 | // Slightly faster than memcpy 134 | static inline void memcpy_12(uint8_t* dst, const uint8_t* src) { 135 | *((uint64_t*) dst) = *((uint64_t*) src); 136 | *((uint32_t*) (dst + 8)) = *((uint32_t*) (src + 8)); 137 | } 138 | 139 | static inline void write_output_pixels(__m512i pixels_1357, __m512i pixels_2468, 140 | interpolate::BGRPixel output_pixels[8]) { 141 | 142 | // Unpack to get adjacent pixel data in lower 64 bits of each lane 143 | // _ _ 8 7 | _ _ 6 5 | _ _ 4 3 | _ _ 2 1 144 | __m512i combined = _mm512_unpacklo_epi32(pixels_1357, pixels_2468); 145 | 146 | // If AVX512 VBMI was available, _mm512_permutexvar_epi8 could pack 147 | // all the pixels into the lower 24 bytes in one instruction. 
148 | 149 | // (unused) | 8 7 6 5 | (unused) | 4 3 2 1 150 | combined = _mm512_permutex_epi64(combined, _MM_SHUFFLE(3, 3, 2, 0)); 151 | 152 | // Pack the pixels into the lower 96 bits of lanes 1 and 3 153 | // (unused) | _ 8 7 6 5 | (unused) | _ 4 3 2 1 154 | combined = _mm512_shuffle_epi8(combined, 155 | _mm512_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, // unused 156 | -1, -1, -1, -1, -1, -1, -1, -1, // unused 157 | -1, -1, -1, -1, // unused 158 | 30, 29, 28, 26, 25, 24, // 8, 7 159 | 22, 21, 20, 18, 17, 16, // 6, 5 160 | -1, -1, -1, -1, -1, -1, -1, -1, // unused 161 | -1, -1, -1, -1, -1, -1, -1, -1, // unused 162 | -1, -1, -1, -1, // unused 163 | 14, 13, 12, 10, 9, 8, // 4, 3 164 | 6, 5, 4, 2, 1, 0 // 2, 1 165 | )); 166 | 167 | // Store pixel data 168 | alignas(64) uint8_t stored[64]; 169 | _mm512_store_si512((__m512i*) stored, combined); 170 | 171 | // Write pixel data back to image. 172 | memcpy_12((uint8_t*) output_pixels, stored); 173 | memcpy_12((uint8_t*) (output_pixels + 4), stored + 32); 174 | } 175 | 176 | // Bilinear interpolation of 8 adjacent output pixels with the supplied coordinates using AVX512. 
177 | static inline void interpolate(const interpolate::BGRImage& image, 178 | const interpolate::InputCoords input_coords[8], 179 | interpolate::BGRPixel output_pixels[8]) { 180 | 181 | const __m512i weights = calculate_weights(&input_coords[0].y); 182 | 183 | const __m512i weights_1357 = _mm512_unpacklo_epi64(weights, weights); 184 | const __m512i pixels_1357 = interpolate_four_pixels(image, input_coords, weights_1357); 185 | 186 | const __m512i weights_2468 = _mm512_unpackhi_epi64(weights, weights); 187 | const __m512i pixels_2468 = interpolate_four_pixels(image, input_coords + 1, weights_2468); 188 | 189 | write_output_pixels(pixels_1357, pixels_2468, output_pixels); 190 | } 191 | 192 | } // namespace interpolate::bilinear::avx512 193 | -------------------------------------------------------------------------------- /Benchmarks.md: -------------------------------------------------------------------------------- 1 | ## Benchmark Results 2 | 3 | ### 2020-08-11 4 | 5 | Apply weights computation and storage improvements to AVX512. 
6 | 7 | ``` 8 | Input image size: 3840x2160 9 | Output image size: 1280x720 10 | OpenCV: numberOfCPUS=16 getNumThreads=16 11 | 2020-08-11 12:14:27 12 | Running ./bilinear_filter_simd 13 | Run on (16 X 3450.49 MHz CPU s) 14 | CPU Caches: 15 | L1 Data 32 KiB (x8) 16 | L1 Instruction 32 KiB (x8) 17 | L2 Unified 1024 KiB (x8) 18 | L3 Unified 25344 KiB (x1) 19 | Load Average: 0.53, 0.83, 0.39 20 | --------------------------------------------------------------------------------- 21 | Benchmark Time CPU Iterations 22 | --------------------------------------------------------------------------------- 23 | No SIMD - single thread/min_time:2.000 13.3 ms 13.3 ms 224 24 | No SIMD - multi thread/min_time:2.000 1.23 ms 1.23 ms 2283 25 | SSE4 - single thread/min_time:2.000 9.33 ms 9.33 ms 304 26 | SSE4 - multi thread/min_time:2.000 0.933 ms 0.933 ms 2992 27 | AVX2 - single thread/min_time:2.000 6.92 ms 6.92 ms 405 28 | AVX2 - multi thread/min_time:2.000 0.806 ms 0.806 ms 3450 29 | AVX512 - single thread/min_time:2.000 5.87 ms 5.87 ms 483 30 | AVX512 - multi thread/min_time:2.000 0.743 ms 0.743 ms 3777 31 | ``` 32 | 33 | ### 2020-08-10 34 | 35 | One less instruction to compute weights for SSE4 and AVX2. 
36 | 37 | ``` 38 | Input image size: 3840x2160 39 | Output image size: 1280x720 40 | OpenCV: numberOfCPUS=12 getNumThreads=12 41 | 2020-08-10 23:10:15 42 | Running ./bilinear_filter_simd 43 | Run on (12 X 4600 MHz CPU s) 44 | CPU Caches: 45 | L1 Data 32 KiB (x6) 46 | L1 Instruction 32 KiB (x6) 47 | L2 Unified 256 KiB (x6) 48 | L3 Unified 12288 KiB (x1) 49 | Load Average: 0.45, 0.25, 0.20 50 | ------------------------------------------------------------------------------ 51 | Benchmark Time CPU Iterations 52 | ------------------------------------------------------------------------------ 53 | SSE4 - single thread/min_time:2.000 4.97 ms 4.97 ms 555 54 | AVX2 - single thread/min_time:2.000 4.05 ms 4.05 ms 691 55 | ``` 56 | 57 | Combined storage of output pixels into a single memcpy for SSE4 and AVX2. 58 | 59 | ``` 60 | Input image size: 3840x2160 61 | Output image size: 1280x720 62 | OpenCV: numberOfCPUS=12 getNumThreads=12 63 | 2020-08-10 22:20:51 64 | Running ./bilinear_filter_simd 65 | Run on (12 X 4600 MHz CPU s) 66 | CPU Caches: 67 | L1 Data 32 KiB (x6) 68 | L1 Instruction 32 KiB (x6) 69 | L2 Unified 256 KiB (x6) 70 | L3 Unified 12288 KiB (x1) 71 | Load Average: 3.25, 2.08, 1.04 72 | --------------------------------------------------------------------------------- 73 | Benchmark Time CPU Iterations 74 | --------------------------------------------------------------------------------- 75 | No SIMD - single thread/min_time:2.000 7.58 ms 7.58 ms 371 76 | No SIMD - multi thread/min_time:2.000 1.66 ms 1.64 ms 1696 77 | SSE4 - single thread/min_time:2.000 5.01 ms 5.01 ms 555 78 | SSE4 - multi thread/min_time:2.000 1.37 ms 1.37 ms 2056 79 | AVX2 - single thread/min_time:2.000 4.07 ms 4.07 ms 689 80 | AVX2 - multi thread/min_time:2.000 1.33 ms 1.33 ms 2098 81 | ``` 82 | 83 | ### 2020-08-09 84 | 85 | Implemented AVX512. Benchmarks from a c5.4xlarge EC2 instance. 
86 | 87 | ``` 88 | Input image size: 3840x2160 89 | Output image size: 1280x720 90 | OpenCV: numberOfCPUS=16 getNumThreads=16 91 | 2020-08-09 11:34:16 92 | Running ./bilinear_filter_simd 93 | Run on (16 X 3400.97 MHz CPU s) 94 | CPU Caches: 95 | L1 Data 32 KiB (x8) 96 | L1 Instruction 32 KiB (x8) 97 | L2 Unified 1024 KiB (x8) 98 | L3 Unified 25344 KiB (x1) 99 | Load Average: 0.00, 0.04, 0.13 100 | --------------------------------------------------------------------------------- 101 | Benchmark Time CPU Iterations 102 | --------------------------------------------------------------------------------- 103 | No SIMD - single thread/min_time:2.000 13.3 ms 13.3 ms 215 104 | No SIMD - multi thread/min_time:2.000 1.25 ms 1.25 ms 2227 105 | SSE4 - single thread/min_time:2.000 10.7 ms 10.7 ms 267 106 | SSE4 - multi thread/min_time:2.000 0.987 ms 0.986 ms 2856 107 | AVX2 - single thread/min_time:2.000 8.14 ms 8.14 ms 360 108 | AVX2 - multi thread/min_time:2.000 0.868 ms 0.868 ms 3253 109 | AVX512 - single thread/min_time:2.000 7.33 ms 7.33 ms 373 110 | AVX512 - multi thread/min_time:2.000 0.822 ms 0.822 ms 3430 111 | ``` 112 | 113 | ### 2020-08-05 114 | 115 | After optimising data loading for weight calculation in SSE4/AVX2 implementations. 
116 | 117 | ``` 118 | 2020-08-05 22:00:27 119 | Running ./bilinear_filter_simd 120 | Run on (12 X 4600 MHz CPU s) 121 | CPU Caches: 122 | L1 Data 32 KiB (x6) 123 | L1 Instruction 32 KiB (x6) 124 | L2 Unified 256 KiB (x6) 125 | L3 Unified 12288 KiB (x1) 126 | Load Average: 0.27, 0.21, 0.14 127 | --------------------------------------------------------------------------------- 128 | Benchmark Time CPU Iterations 129 | --------------------------------------------------------------------------------- 130 | No SIMD - single thread/min_time:2.000 7.48 ms 7.48 ms 375 131 | SSE4 - single thread/min_time:2.000 5.21 ms 5.21 ms 538 132 | AVX2 - single thread/min_time:2.000 4.23 ms 4.23 ms 661 133 | ``` 134 | 135 | ### 2020-08-03 136 | 137 | After applying same technique to AVX2 and calculating 4 weights at once with 16 bit ints. 138 | Not much difference. 139 | 140 | ``` 141 | 2020-08-03 21:10:56 142 | Running ./bilinear_filter_simd 143 | Run on (12 X 4600 MHz CPU s) 144 | CPU Caches: 145 | L1 Data 32 KiB (x6) 146 | L1 Instruction 32 KiB (x6) 147 | L2 Unified 256 KiB (x6) 148 | L3 Unified 12288 KiB (x1) 149 | Load Average: 5.54, 2.48, 1.00 150 | --------------------------------------------------------------------------------- 151 | Benchmark Time CPU Iterations 152 | --------------------------------------------------------------------------------- 153 | No SIMD - single thread/min_time:2.000 7.58 ms 7.58 ms 369 154 | No SIMD - multi thread/min_time:2.000 1.68 ms 1.68 ms 1661 155 | SSE4 - single thread/min_time:2.000 5.52 ms 5.52 ms 505 156 | SSE4 - multi thread/min_time:2.000 1.42 ms 1.42 ms 1947 157 | AVX2 - single thread/min_time:2.000 4.63 ms 4.63 ms 603 158 | AVX2 - multi thread/min_time:2.000 1.33 ms 1.33 ms 2110 159 | ``` 160 | 161 | ### 2020-08-02 162 | 163 | After calculating two weights at once with SSE4 using 16 bit ints: 164 | 165 | ``` 166 | 2020-08-03 00:35:04 167 | Running ./bilinear_filter_simd 168 | Run on (12 X 4600 MHz CPU s) 169 | CPU Caches: 170 | L1 Data 
32 KiB (x6) 171 | L1 Instruction 32 KiB (x6) 172 | L2 Unified 256 KiB (x6) 173 | L3 Unified 12288 KiB (x1) 174 | Load Average: 0.49, 0.40, 0.34 175 | --------------------------------------------------------------------------------- 176 | Benchmark Time CPU Iterations 177 | --------------------------------------------------------------------------------- 178 | No SIMD - single thread/min_time:2.000 7.60 ms 7.60 ms 346 179 | No SIMD - multi thread/min_time:2.000 1.68 ms 1.68 ms 1657 180 | SSE4 - single thread/min_time:2.000 5.49 ms 5.49 ms 511 181 | SSE4 - multi thread/min_time:2.000 1.42 ms 1.42 ms 1972 182 | AVX2 - single thread/min_time:2.000 4.67 ms 4.67 ms 596 183 | AVX2 - multi thread/min_time:2.000 1.37 ms 1.37 ms 2049 184 | ``` 185 | 186 | After introducing `BGRImage` type: 187 | 188 | ``` 189 | 2020-08-02 16:27:20 190 | Running ./bilinear_filter_simd 191 | Run on (12 X 4600 MHz CPU s) 192 | CPU Caches: 193 | L1 Data 32 KiB (x6) 194 | L1 Instruction 32 KiB (x6) 195 | L2 Unified 256 KiB (x6) 196 | L3 Unified 12288 KiB (x1) 197 | Load Average: 1.53, 1.62, 1.10 198 | --------------------------------------------------------------------------------- 199 | Benchmark Time CPU Iterations 200 | --------------------------------------------------------------------------------- 201 | No SIMD - single thread/min_time:2.000 7.59 ms 7.59 ms 369 202 | No SIMD - multi thread/min_time:2.000 1.68 ms 1.68 ms 1665 203 | SSE4 - single thread/min_time:2.000 5.88 ms 5.88 ms 473 204 | SSE4 - multi thread/min_time:2.000 1.47 ms 1.47 ms 1898 205 | AVX2 - single thread/min_time:2.000 4.64 ms 4.64 ms 601 206 | AVX2 - multi thread/min_time:2.000 1.35 ms 1.35 ms 2068 207 | ``` 208 | 209 | Before introducing `BGRImage` type: 210 | 211 | ``` 212 | 2020-08-02 15:33:51 213 | Running ./bilinear_filter_simd 214 | Run on (12 X 4600 MHz CPU s) 215 | CPU Caches: 216 | L1 Data 32 KiB (x6) 217 | L1 Instruction 32 KiB (x6) 218 | L2 Unified 256 KiB (x6) 219 | L3 Unified 12288 KiB (x1) 220 | Load 
Average: 0.04, 0.09, 0.48 221 | --------------------------------------------------------------------------------- 222 | Benchmark Time CPU Iterations 223 | --------------------------------------------------------------------------------- 224 | No SIMD - single thread/min_time:2.000 8.12 ms 8.12 ms 346 225 | No SIMD - multi thread/min_time:2.000 1.72 ms 1.72 ms 1623 226 | SSE4 - single thread/min_time:5.000 6.18 ms 6.18 ms 453 227 | SSE4 - multi thread/min_time:2.000 1.50 ms 1.50 ms 1866 228 | AVX2 - single thread/min_time:2.000 5.29 ms 5.29 ms 528 229 | AVX2 - multi thread/min_time:2.000 1.36 ms 1.36 ms 2050 230 | ``` 231 | 232 | ### 2020-07-18 233 | 234 | ``` 235 | 2020-07-18 14:12:07 236 | Running ./bilinear_filter_simd 237 | Run on (12 X 4600 MHz CPU s) 238 | CPU Caches: 239 | L1 Data 32 KiB (x6) 240 | L1 Instruction 32 KiB (x6) 241 | L2 Unified 256 KiB (x6) 242 | L3 Unified 12288 KiB (x1) 243 | Load Average: 3.22, 1.50, 0.86 244 | --------------------------------------------------------------------------------- 245 | Benchmark Time CPU Iterations 246 | --------------------------------------------------------------------------------- 247 | No SIMD - single thread/min_time:2.000 10.4 ms 10.4 ms 269 248 | No SIMD - multi thread/min_time:2.000 2.02 ms 2.02 ms 1382 249 | SSE4 - single thread/min_time:2.000 7.04 ms 7.04 ms 394 250 | SSE4 - multi thread/min_time:2.000 1.60 ms 1.60 ms 1749 251 | AVX2 - single thread/min_time:2.000 5.81 ms 5.81 ms 480 252 | AVX2 - multi thread/min_time:2.000 1.32 ms 1.32 ms 2127 253 | ``` 254 | 255 | ### 2020-06-14 256 | 257 | ``` 258 | 2020-06-14 00:58:30 259 | Running ./bilinear_filter_simd 260 | Run on (4 X 3100 MHz CPU s) 261 | CPU Caches: 262 | L1 Data 32 KiB (x2) 263 | L1 Instruction 32 KiB (x2) 264 | L2 Unified 256 KiB (x2) 265 | L3 Unified 4096 KiB (x1) 266 | Load Average: 2.21, 2.64, 2.35 267 | -------------------------------------------------------------------------------------------- 268 | Benchmark Time CPU Iterations 269 | 
-------------------------------------------------------------------------------------------- 270 | BM_interpolate_plain_single_thread/min_time:2.000 26.6 ms 26.4 ms 101 271 | BM_interpolate_plain_multi_thread/min_time:2.000 5.45 ms 4.87 ms 576 272 | BM_interpolate_sse4_single_thread/min_time:2.000 24.2 ms 24.0 ms 118 273 | BM_interpolate_sse4_multi_thread/min_time:2.000 4.04 ms 3.56 ms 822 274 | BM_interpolate_avx2_single_thread/min_time:2.000 13.1 ms 13.0 ms 217 275 | BM_interpolate_avx2_multi_thread/min_time:2.000 2.49 ms 2.31 ms 1196 276 | ``` 277 | --------------------------------------------------------------------------------