├── LICENSE.txt ├── README.md ├── apple_amx.h ├── cpu.cpp ├── cpu.h ├── rgb2yuv.cpp ├── rgb2yuv.h ├── rgb2yuv.inl ├── yuv.h ├── yuv2rgb.cpp ├── yuv2rgb.h ├── yuv2rgb.inl ├── yuv2rgb_amx.cpp ├── yuv2rgb_amx.h ├── yuv2yuva.cpp ├── yuv2yuva.h └── yuv2yuva.inl /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020-2021 TAiGA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xxYUV 2 | Convert between RGB and YUV 3 | 4 | ## Benchmark Environment 5 | https://github.com/metarutaiga/xxImGui/tree/experimental 6 | 7 | ## Performance (macOS) Encode / Decode (unit : us) 8 | 9 | | | | Encode | | Decode | | 10 | | ---------- | --------- | ----------- | ----------- | ----------- | ----------- | 11 | | | Apple M1 | YU12 / YV12 | NV12 / NV21 | YU12 / YV12 | NV12 / NV21 | 12 | | xxYUV | AMX | ? | ? | 67 | ? | 13 | | xxYUV | NEON | 37 | 38 | 38 | 42 | 14 | | libyuv | NEON | 48 | 49 | 122 | 89 | 15 | | Accelerate | NEON | 56 | 55 | 62 | 59 | 16 | | xxYUV | SSSE3 | 134 | 133 | | | 17 | | libyuv | SSSE3 | 146 | 146 | 171 | 164 | 18 | | Accelerate | SSSE3 | 273 | 274 | 232 | 231 | 19 | | xxYUV | SSE2 | 142 | 143 | 58 | 56 | 20 | 21 | | | | Encode | | Decode | | 22 | | ---------- | --------- | ----------- | ----------- | ----------- | ----------- | 23 | | | i7-8700B | YU12 / YV12 | NV12 / NV21 | YU12 / YV12 | NV12 / NV21 | 24 | | xxYUV | AVX2 | 31 | 33 | 46 | 39 | 25 | | libyuv | AVX2 | 48 | 39 | 60 | 54 | 26 | | Accelerate | AVX2 | 83 | 84 | 67 | 62 | 27 | | xxYUV | SSSE3 | 50 | 51 | | | 28 | | libyuv | SSSE3 | 60 | 61 | 87 | 82 | 29 | | xxYUV | SSE2 | 90 | 91 | 69 | 62 | 30 | -------------------------------------------------------------------------------- /apple_amx.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : Apple AMX Header 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #if defined(__APPLE__) && defined(__aarch64__) 10 | 
//------------------------------------------------------------------------------ 11 | // https://gist.github.com/dougallj/7a75a3be1ec69ca550e7c36dc75e0d6f 12 | // https://gist.github.com/dougallj/7cba721da1a94da725ee37c1e9cd1f21 13 | //------------------------------------------------------------------------------ 14 | #include 15 | #include 16 | //------------------------------------------------------------------------------ 17 | union amx_operands_access 18 | { 19 | struct 20 | { 21 | uint64_t memory_offset:56; 22 | uint64_t register_index:6; 23 | uint64_t double_width:1; 24 | uint64_t dummy_63:1; 25 | }; 26 | uint64_t value; 27 | }; 28 | //------------------------------------------------------------------------------ 29 | union amx_operands_extract 30 | { 31 | struct 32 | { 33 | uint64_t offset_y:10; 34 | uint64_t offset_x:10; 35 | uint64_t offset_z:7; 36 | uint64_t dummy_27:37; 37 | }; 38 | uint64_t value; 39 | }; 40 | //------------------------------------------------------------------------------ 41 | union amx_operands_scalar 42 | { 43 | struct 44 | { 45 | uint64_t offset_y:10; 46 | uint64_t offset_x:10; 47 | uint64_t offset_z:7; 48 | uint64_t skip_z:1; 49 | uint64_t skip_y:1; 50 | uint64_t skip_x:1; 51 | uint64_t dummy_30:2; 52 | uint64_t disable_x:7; 53 | uint64_t dummy_39:2; 54 | uint64_t disable_y:7; 55 | uint64_t dummy_48:13; 56 | uint64_t mode_8:1; 57 | uint64_t mode_32:1; 58 | uint64_t vector_matrix_add:1; 59 | }; 60 | uint64_t value; 61 | }; 62 | //------------------------------------------------------------------------------ 63 | union amx_operands_vector 64 | { 65 | struct 66 | { 67 | uint64_t offset_y:10; 68 | uint64_t offset_x:10; 69 | uint64_t offset_z:7; 70 | uint64_t count_y:2; 71 | uint64_t count_x:2; 72 | uint64_t dummy_31:1; 73 | uint64_t mask:10; 74 | uint64_t extended:4; 75 | uint64_t dummy_46:1; 76 | uint64_t neg:1; 77 | uint64_t add:1; 78 | uint64_t dummy_49:9; 79 | uint64_t shift_right:5; 80 | uint64_t sign:1; 81 | }; 82 | uint64_t 
value; 83 | }; 84 | //------------------------------------------------------------------------------ 85 | union amx_operands_matrix 86 | { 87 | struct 88 | { 89 | uint64_t offset_y:10; 90 | uint64_t offset_x:10; 91 | uint64_t offset_z:7; 92 | uint64_t dummy_27:5; 93 | uint64_t mask:10; 94 | uint64_t extended:4; 95 | uint64_t dummy_46:1; 96 | uint64_t neg:1; 97 | uint64_t add:1; 98 | uint64_t dummy_49:9; 99 | uint64_t shift_right:5; 100 | uint64_t sign:1; 101 | }; 102 | uint64_t value; 103 | }; 104 | //------------------------------------------------------------------------------ 105 | #define amx_ldx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 0 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 106 | #define amx_ldy(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 1 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 107 | #define amx_stx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 2 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 108 | #define amx_sty(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 3 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 109 | #define amx_ldz(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 4 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 110 | #define amx_stz(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 5 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 111 | #define amx_ldzi(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 6 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 112 | #define amx_stzi(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 7 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 113 | #define amx_extrx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 8 << 5) | 0)" ::"r"((amx_operands_extract{__VA_ARGS__})) : "x0", "memory") 114 | #define amx_extry(...) 
__asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 9 << 5) | 0)" ::"r"((amx_operands_extract{__VA_ARGS__})) : "x0", "memory") 115 | #define amx_fma64(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (10 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 116 | #define amx_fms64(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (11 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 117 | #define amx_fma32(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (12 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 118 | #define amx_fms32(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (13 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 119 | #define amx_mac16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (14 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 120 | #define amx_fma16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (15 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 121 | #define amx_fms16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (16 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 122 | #define amx_set() __asm__ volatile("nop \n nop \n nop \n .word (0x201000 | (17 << 5) | 0)" ::: "memory") 123 | #define amx_clr() __asm__ volatile("nop \n nop \n nop \n .word (0x201000 | (17 << 5) | 1)" ::: "memory") 124 | #define amx_vecint(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (18 << 5) | 0)" ::"r"((amx_operands_vector{__VA_ARGS__})) : "x0", "memory") 125 | #define amx_vecfp(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (19 << 5) | 0)" ::"r"((amx_operands_vector{__VA_ARGS__})) : "x0", "memory") 126 | #define amx_matint(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (20 << 5) | 0)" ::"r"((amx_operands_matrix{__VA_ARGS__})) : "x0", "memory") 127 | #define amx_matfp(...) 
__asm__ volatile("mov x0, %0 \n .word (0x201000 | (21 << 5) | 0)" ::"r"((amx_operands_matrix{__VA_ARGS__})) : "x0", "memory") 128 | #define amx_gemlut(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (22 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 129 | //------------------------------------------------------------------------------ 130 | inline void amx_dump(int index, int16_t hint) 131 | { 132 | uint8_t row[64]; 133 | amx_stz( .memory_offset = (uint64_t)row, .register_index = (uint64_t)index ); 134 | printf("%2d : ", index); 135 | for (int i = 0; i < 64; ++i) 136 | { 137 | if (i == 0) 138 | { 139 | printf("(%04X) ", __builtin_bswap16(hint) & 0xFFFF); 140 | } 141 | printf("%02X", row[i]); 142 | if (i % 8 == 7) 143 | printf(" "); 144 | } 145 | printf("\n"); 146 | } 147 | //------------------------------------------------------------------------------ 148 | #endif 149 | -------------------------------------------------------------------------------- /cpu.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : cpu Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 8 | # if defined(_M_IX86) || defined(_M_AMD64) 9 | # include 10 | # define bit_SSSE3 (1 << 9) 11 | # define bit_AVX2 (1 << 5) 12 | # define bit_AVX512BW (1 << 30) 13 | static inline int __get_cpuid(int leaf, unsigned int* eax, unsigned int* ebx, unsigned int* ecx, unsigned int* edx) 14 | { 15 | int regs[4]; 16 | __cpuid(regs, leaf); 17 | *eax = regs[0]; 18 | *ebx = regs[1]; 19 | *ecx = regs[2]; 20 | *edx = regs[3]; 21 | return 1; 22 | } 23 | static inline int __get_cpuid_count(int leaf, int subleaf, unsigned int* eax, unsigned int* 
ebx, unsigned int* ecx, unsigned int* edx) 24 | { 25 | int regs[4]; 26 | __cpuidex(regs, leaf, subleaf); 27 | *eax = regs[0]; 28 | *ebx = regs[1]; 29 | *ecx = regs[2]; 30 | *edx = regs[3]; 31 | return 1; 32 | } 33 | # elif defined(__i386__) || defined(__amd64__) 34 | # include 35 | # endif 36 | # include 37 | #endif 38 | #include "cpu.h" 39 | 40 | //------------------------------------------------------------------------------ 41 | bool ssse3() 42 | { 43 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 44 | unsigned int eax, ebx, ecx, edx; 45 | if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) 46 | { 47 | return (ecx & bit_SSSE3) != 0; 48 | } 49 | #endif 50 | return false; 51 | } 52 | //------------------------------------------------------------------------------ 53 | bool avx2() 54 | { 55 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 56 | unsigned int eax, ebx, ecx, edx; 57 | if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) 58 | { 59 | return (ebx & bit_AVX2) != 0; 60 | } 61 | #endif 62 | return false; 63 | } 64 | //------------------------------------------------------------------------------ 65 | bool avx512bw() 66 | { 67 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 68 | unsigned int eax, ebx, ecx, edx; 69 | if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) 70 | { 71 | return (ebx & bit_AVX512BW) != 0; 72 | } 73 | #endif 74 | return false; 75 | } 76 | //------------------------------------------------------------------------------ 77 | -------------------------------------------------------------------------------- /cpu.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : cpu Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | 
//============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | inline bool neon() { return true; } 15 | //------------------------------------------------------------------------------ 16 | inline bool sse2() { return true; } 17 | xxYUV_EXPORT bool ssse3(); 18 | xxYUV_EXPORT bool avx2(); 19 | xxYUV_EXPORT bool avx512bw(); 20 | //------------------------------------------------------------------------------ 21 | -------------------------------------------------------------------------------- /rgb2yuv.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "rgb2yuv.inl" 12 | #include "rgb2yuv.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void rgb2yuv_yu12(const rgb2yuv_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* rgb = parameter->rgb; 23 | int componentRGB = parameter->componentRGB; 24 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 25 | bool swizzleRGB = parameter->swizzleRGB; 26 | if (strideRGB < 0) 27 | { 28 | rgb = (char*)rgb - (strideRGB * (height - 1)); 29 | } 30 | 31 | void* y = parameter->y; 32 | void* u = parameter->u; 33 | void* v = parameter->v; 34 | int alignWidth = parameter->alignWidth ? 
parameter->alignWidth : 16; 35 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 36 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 37 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 38 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 39 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 40 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 41 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 42 | bool videoRange = parameter->videoRange; 43 | 44 | u = u ? u : (char*)y + sizeY; 45 | v = v ? v : (char*)y + sizeY + sizeU; 46 | 47 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 48 | 49 | if (componentRGB == 3) 50 | { 51 | if (swizzleRGB) 52 | { 53 | if (videoRange) 54 | { 55 | static auto select = rgb2yuv_select(3, true, false, false, true); 56 | converter = select; 57 | } 58 | else 59 | { 60 | static auto select = rgb2yuv_select(3, true, false, false, false); 61 | converter = select; 62 | } 63 | } 64 | else 65 | { 66 | if (videoRange) 67 | { 68 | static auto select = rgb2yuv_select(3, false, false, false, true); 69 | converter = select; 70 | } 71 | else 72 | { 73 | static auto select = rgb2yuv_select(3, false, false, false, false); 74 | converter = select; 75 | } 76 | } 77 | } 78 | else if (componentRGB == 4) 79 | { 80 | if (swizzleRGB) 81 | { 82 | if (videoRange) 83 | { 84 | static auto select = rgb2yuv_select(4, true, false, false, true); 85 | converter = select; 86 | } 87 | else 88 | { 89 | static auto select = rgb2yuv_select(4, true, false, false, false); 90 | converter = select; 91 | } 92 | } 93 | else 94 | { 95 | if (videoRange) 96 | { 97 | static auto select = rgb2yuv_select(4, false, false, false, true); 98 | converter = select; 99 | } 100 | else 101 | { 102 | static auto 
select = rgb2yuv_select(4, false, false, false, false); 103 | converter = select; 104 | } 105 | } 106 | } 107 | else 108 | { 109 | return; 110 | } 111 | 112 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideU, strideU); 113 | } 114 | //------------------------------------------------------------------------------ 115 | void rgb2yuv_yv12(const rgb2yuv_parameter* parameter) 116 | { 117 | int width = parameter->width; 118 | int height = parameter->height; 119 | 120 | const void* rgb = parameter->rgb; 121 | int componentRGB = parameter->componentRGB; 122 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 123 | bool swizzleRGB = parameter->swizzleRGB; 124 | if (strideRGB < 0) 125 | { 126 | rgb = (char*)rgb - (strideRGB * (height - 1)); 127 | } 128 | 129 | void* y = parameter->y; 130 | void* u = parameter->u; 131 | void* v = parameter->v; 132 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 133 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 134 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 135 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 136 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 137 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 138 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 139 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 140 | bool videoRange = parameter->videoRange; 141 | 142 | u = u ? u : (char*)y + sizeY + sizeU; 143 | v = v ? 
v : (char*)y + sizeY; 144 | 145 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 146 | 147 | if (componentRGB == 3) 148 | { 149 | if (swizzleRGB) 150 | { 151 | if (videoRange) 152 | { 153 | static auto select = rgb2yuv_select(3, true, false, false, true); 154 | converter = select; 155 | } 156 | else 157 | { 158 | static auto select = rgb2yuv_select(3, true, false, false, false); 159 | converter = select; 160 | } 161 | } 162 | else 163 | { 164 | if (videoRange) 165 | { 166 | static auto select = rgb2yuv_select(3, false, false, false, true); 167 | converter = select; 168 | } 169 | else 170 | { 171 | static auto select = rgb2yuv_select(3, false, false, false, false); 172 | converter = select; 173 | } 174 | } 175 | } 176 | else if (componentRGB == 4) 177 | { 178 | if (swizzleRGB) 179 | { 180 | if (videoRange) 181 | { 182 | static auto select = rgb2yuv_select(4, true, false, false, true); 183 | converter = select; 184 | } 185 | else 186 | { 187 | static auto select = rgb2yuv_select(4, true, false, false, false); 188 | converter = select; 189 | } 190 | } 191 | else 192 | { 193 | if (videoRange) 194 | { 195 | static auto select = rgb2yuv_select(4, false, false, false, true); 196 | converter = select; 197 | } 198 | else 199 | { 200 | static auto select = rgb2yuv_select(4, false, false, false, false); 201 | converter = select; 202 | } 203 | } 204 | } 205 | else 206 | { 207 | return; 208 | } 209 | 210 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideU, strideU); 211 | } 212 | //------------------------------------------------------------------------------ 213 | void rgb2yuv_nv12(const rgb2yuv_parameter* parameter) 214 | { 215 | int width = parameter->width; 216 | int height = parameter->height; 217 | 218 | const void* rgb = parameter->rgb; 219 | int componentRGB = parameter->componentRGB; 220 | int strideRGB = parameter->strideRGB ? 
parameter->strideRGB : componentRGB * width; 221 | bool swizzleRGB = parameter->swizzleRGB; 222 | if (strideRGB < 0) 223 | { 224 | rgb = (char*)rgb - (strideRGB * (height - 1)); 225 | } 226 | 227 | void* y = parameter->y; 228 | void* u = parameter->u; 229 | void* v = parameter->v; 230 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 231 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 232 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 233 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 234 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 235 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 236 | bool videoRange = parameter->videoRange; 237 | 238 | u = u ? u : (char*)y + sizeY; 239 | v = v ? v : (char*)y + sizeY + 1; 240 | 241 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 242 | 243 | if (componentRGB == 3) 244 | { 245 | if (swizzleRGB) 246 | { 247 | if (videoRange) 248 | { 249 | static auto select = rgb2yuv_select(3, true, true, true, true); 250 | converter = select; 251 | } 252 | else 253 | { 254 | static auto select = rgb2yuv_select(3, true, true, true, false); 255 | converter = select; 256 | } 257 | } 258 | else 259 | { 260 | if (videoRange) 261 | { 262 | static auto select = rgb2yuv_select(3, false, true, true, true); 263 | converter = select; 264 | } 265 | else 266 | { 267 | static auto select = rgb2yuv_select(3, false, true, true, false); 268 | converter = select; 269 | } 270 | } 271 | } 272 | else if (componentRGB == 4) 273 | { 274 | if (swizzleRGB) 275 | { 276 | if (videoRange) 277 | { 278 | static auto select = rgb2yuv_select(4, true, true, true, true); 279 | converter = select; 280 | } 281 | else 282 | { 283 | static auto select = rgb2yuv_select(4, true, true, true, false); 284 | converter = select; 285 | } 286 | } 
287 | else 288 | { 289 | if (videoRange) 290 | { 291 | static auto select = rgb2yuv_select(4, false, true, true, true); 292 | converter = select; 293 | } 294 | else 295 | { 296 | static auto select = rgb2yuv_select(4, false, true, true, false); 297 | converter = select; 298 | } 299 | } 300 | } 301 | else 302 | { 303 | return; 304 | } 305 | 306 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideY, strideY); 307 | } 308 | //------------------------------------------------------------------------------ 309 | void rgb2yuv_nv21(const rgb2yuv_parameter* parameter) 310 | { 311 | int width = parameter->width; 312 | int height = parameter->height; 313 | 314 | const void* rgb = parameter->rgb; 315 | int componentRGB = parameter->componentRGB; 316 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 317 | bool swizzleRGB = parameter->swizzleRGB; 318 | if (strideRGB < 0) 319 | { 320 | rgb = (char*)rgb - (strideRGB * (height - 1)); 321 | } 322 | 323 | void* y = parameter->y; 324 | void* u = parameter->u; 325 | void* v = parameter->v; 326 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 327 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 328 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 329 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 330 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 331 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 332 | bool videoRange = parameter->videoRange; 333 | 334 | u = u ? u : (char*)y + sizeY + 1; 335 | v = v ? 
v : (char*)y + sizeY; 336 | 337 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 338 | 339 | if (componentRGB == 3) 340 | { 341 | if (swizzleRGB) 342 | { 343 | if (videoRange) 344 | { 345 | static auto select = rgb2yuv_select(3, true, true, false, true); 346 | converter = select; 347 | } 348 | else 349 | { 350 | static auto select = rgb2yuv_select(3, true, true, false, false); 351 | converter = select; 352 | } 353 | } 354 | else 355 | { 356 | if (videoRange) 357 | { 358 | static auto select = rgb2yuv_select(3, false, true, false, true); 359 | converter = select; 360 | } 361 | else 362 | { 363 | static auto select = rgb2yuv_select(3, false, true, false, false); 364 | converter = select; 365 | } 366 | } 367 | } 368 | else if (componentRGB == 4) 369 | { 370 | if (swizzleRGB) 371 | { 372 | if (videoRange) 373 | { 374 | static auto select = rgb2yuv_select(4, true, true, false, true); 375 | converter = select; 376 | } 377 | else 378 | { 379 | static auto select = rgb2yuv_select(4, true, true, false, false); 380 | converter = select; 381 | } 382 | } 383 | else 384 | { 385 | if (videoRange) 386 | { 387 | static auto select = rgb2yuv_select(4, false, true, false, true); 388 | converter = select; 389 | } 390 | else 391 | { 392 | static auto select = rgb2yuv_select(4, false, true, false, false); 393 | converter = select; 394 | } 395 | } 396 | } 397 | else 398 | { 399 | return; 400 | } 401 | 402 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideY, strideY); 403 | } 404 | //------------------------------------------------------------------------------ 405 | -------------------------------------------------------------------------------- /rgb2yuv.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Header 3 | // 4 | // Copyright (c) 
2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | typedef struct _rgb2yuv_parameter 15 | { 16 | int width; 17 | int height; 18 | 19 | const void* rgb; 20 | int componentRGB; 21 | int strideRGB; 22 | bool swizzleRGB; 23 | 24 | void* y; 25 | void* u; 26 | void* v; 27 | int alignWidth; 28 | int alignHeight; 29 | int alignSize; 30 | int strideY; 31 | int strideU; 32 | int strideV; 33 | bool videoRange; 34 | } rgb2yuv_parameter; 35 | //------------------------------------------------------------------------------ 36 | xxYUV_EXPORT void rgb2yuv_yu12(const rgb2yuv_parameter* parameter); 37 | xxYUV_EXPORT void rgb2yuv_yv12(const rgb2yuv_parameter* parameter); 38 | xxYUV_EXPORT void rgb2yuv_nv12(const rgb2yuv_parameter* parameter); 39 | xxYUV_EXPORT void rgb2yuv_nv21(const rgb2yuv_parameter* parameter); 40 | //------------------------------------------------------------------------------ 41 | #ifndef xxYUV_DEPRECATED 42 | //------------------------------------------------------------------------------ 43 | inline void rgb2yuv_yu12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 44 | { 45 | rgb2yuv_parameter parameter = 46 | { 47 | .width = width, 48 | .height = height, 49 | .rgb = rgb, 50 | .componentRGB = rgbWidth, 51 | .strideRGB = strideRGB, 52 | .swizzleRGB = rgbSwizzle, 53 | .y = yuv, 54 | .alignWidth = alignWidth, 55 | .alignHeight = alignHeight, 56 | .alignSize = alignSize, 57 | .videoRange = !fullRange, 58 | }; 59 | rgb2yuv_yu12(¶meter); 60 | } 61 | //------------------------------------------------------------------------------ 62 | inline void 
rgb2yuv_yv12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 63 | { 64 | rgb2yuv_parameter parameter = 65 | { 66 | .width = width, 67 | .height = height, 68 | .rgb = rgb, 69 | .componentRGB = rgbWidth, 70 | .strideRGB = strideRGB, 71 | .swizzleRGB = rgbSwizzle, 72 | .y = yuv, 73 | .alignWidth = alignWidth, 74 | .alignHeight = alignHeight, 75 | .alignSize = alignSize, 76 | .videoRange = !fullRange, 77 | }; 78 | rgb2yuv_yv12(¶meter); 79 | } 80 | //------------------------------------------------------------------------------ 81 | inline void rgb2yuv_nv12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 82 | { 83 | rgb2yuv_parameter parameter = 84 | { 85 | .width = width, 86 | .height = height, 87 | .rgb = rgb, 88 | .componentRGB = rgbWidth, 89 | .strideRGB = strideRGB, 90 | .swizzleRGB = rgbSwizzle, 91 | .y = yuv, 92 | .alignWidth = alignWidth, 93 | .alignHeight = alignHeight, 94 | .alignSize = alignSize, 95 | .videoRange = !fullRange, 96 | }; 97 | rgb2yuv_nv12(¶meter); 98 | } 99 | //------------------------------------------------------------------------------ 100 | inline void rgb2yuv_nv21(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 101 | { 102 | rgb2yuv_parameter parameter = 103 | { 104 | .width = width, 105 | .height = height, 106 | .rgb = rgb, 107 | .componentRGB = rgbWidth, 108 | .strideRGB = strideRGB, 109 | .swizzleRGB = rgbSwizzle, 110 | .y = yuv, 111 | .alignWidth = alignWidth, 112 | .alignHeight = alignHeight, 113 | .alignSize = alignSize, 114 | .videoRange = !fullRange, 115 | }; 116 | rgb2yuv_nv21(¶meter); 117 | } 
118 | //------------------------------------------------------------------------------ 119 | #endif 120 | //------------------------------------------------------------------------------ 121 | -------------------------------------------------------------------------------- /rgb2yuv.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | // BT.709 - Video Range 8 | // R G B 9 | // Y = 0.18275 0.61477 0.06200 10 | // U = -0.10072 -0.33882 0.43931 11 | // V = 0.43867 -0.40048 -0.04038 12 | // 13 | // BT.709 - Full Range 14 | // R G B 15 | // Y = 0.21260 0.71520 0.07220 16 | // U = -0.11412 -0.38392 0.49804 17 | // V = 0.49804 -0.45237 -0.04567 18 | #define fRY 0.21260 19 | #define fGY 0.71520 20 | #define fBY 0.07220 21 | #define fRU -0.11412 22 | #define fGU -0.38392 23 | #define fBU 0.49804 24 | #define fRV 0.49804 25 | #define fGV -0.45237 26 | #define fBV -0.04567 27 | #define vRY 0.18275 28 | #define vGY 0.61477 29 | #define vBY 0.06200 30 | #define vRU -0.10072 31 | #define vGU -0.33882 32 | #define vBU 0.43931 33 | #define vRV 0.43867 34 | #define vGV -0.40048 35 | #define vBV -0.04038 36 | 37 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 38 | # include 39 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 40 | # include 41 | # include 42 | # if defined(__llvm__) 43 | # include 44 | # include 45 | # endif 46 | # if defined(_MSC_VER) && !defined(__llvm__) 47 | # define _mm_shuffle_ps(a, b, c) (__m128i&)_mm_shuffle_ps((__m128&)a, (__m128&)b, c) 48 | # define _mm256_shuffle_ps(a, b, c) (__m256i&)_mm256_shuffle_ps((__m256&)a, 
(__m256&)b, c) 49 | # define _mm_movehl_ps(a, b) (__m128i&)_mm_movehl_ps((__m128&)a, (__m128&)b) 50 | # define _mm_storel_pi(a, b) _mm_storel_pi(a, (__m128&)b) 51 | # define _mm_storeh_pi(a, b) _mm_storeh_pi(a, (__m128&)b) 52 | # endif 53 | #endif 54 | 55 | //------------------------------------------------------------------------------ 56 | template 57 | void rgb2yuv(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV) 58 | { 59 | int halfWidth = width >> 1; 60 | int halfHeight = height >> 1; 61 | 62 | int iR = swizzleRGB ? 2 : 0; 63 | int iG = 1; 64 | int iB = swizzleRGB ? 0 : 2; 65 | int iA = 3; 66 | 67 | int Y[3], U[3], V[3]; 68 | if (videoRange) 69 | { 70 | Y[iR] = (int)(vRY * 256); U[iR] = (int)(vRU * 255); V[iR] = (int)(vRV * 255); 71 | Y[iG] = (int)(vGY * 256); U[iG] = (int)(vGU * 255); V[iG] = (int)(vGV * 255); 72 | Y[iB] = (int)(vBY * 256); U[iB] = (int)(vBU * 255); V[iB] = (int)(vBV * 255); 73 | } 74 | else 75 | { 76 | Y[iR] = (int)(fRY * 256); U[iR] = (int)(fRU * 255); V[iR] = (int)(fRV * 255); 77 | Y[iG] = (int)(fGY * 256); U[iG] = (int)(fGU * 255); V[iG] = (int)(fGV * 255); 78 | Y[iB] = (int)(fBY * 256); U[iB] = (int)(fBU * 255); V[iB] = (int)(fBV * 255); 79 | } 80 | 81 | for (int h = 0; h < halfHeight; ++h) 82 | { 83 | const unsigned char* rgb0 = (unsigned char*)rgb; 84 | const unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 85 | unsigned char* y0 = (unsigned char*)y; 86 | unsigned char* y1 = y0 + strideY; y = y1 + strideY; 87 | unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 88 | unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 89 | #if HAVE_NEON 90 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 91 | for (int w = 0; w < halfWidth8; ++w) 92 | { 93 | uint8x16x4_t rgb00 = vld4q_u8(rgb0); rgb0 += 16 * 4; 94 | uint8x16x4_t rgb10 = vld4q_u8(rgb1); rgb1 += 16 * 4; 95 | 96 | uint8x8_t r00 = vget_low_u8(rgb00.val[0]); 97 | uint8x8_t g00 = vget_low_u8(rgb00.val[1]); 98 | uint8x8_t b00 = vget_low_u8(rgb00.val[2]); 99 | uint8x8_t r01 = vget_high_u8(rgb00.val[0]); 100 | uint8x8_t g01 = vget_high_u8(rgb00.val[1]); 101 | uint8x8_t b01 = vget_high_u8(rgb00.val[2]); 102 | uint8x8_t r10 = vget_low_u8(rgb10.val[0]); 103 | uint8x8_t g10 = vget_low_u8(rgb10.val[1]); 104 | uint8x8_t b10 = vget_low_u8(rgb10.val[2]); 105 | uint8x8_t r11 = vget_high_u8(rgb10.val[0]); 106 | uint8x8_t g11 = vget_high_u8(rgb10.val[1]); 107 | uint8x8_t b11 = vget_high_u8(rgb10.val[2]); 108 | 109 | uint8x8_t y00 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r00, vdup_n_u8(Y[0])), g00, vdup_n_u8(Y[1])), b00, vdup_n_u8(Y[2])), 8); 110 | uint8x8_t y01 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r01, vdup_n_u8(Y[0])), g01, vdup_n_u8(Y[1])), b01, vdup_n_u8(Y[2])), 8); 111 | uint8x8_t y10 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r10, vdup_n_u8(Y[0])), g10, vdup_n_u8(Y[1])), b10, vdup_n_u8(Y[2])), 8); 112 | uint8x8_t y11 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r11, vdup_n_u8(Y[0])), g11, vdup_n_u8(Y[1])), b11, vdup_n_u8(Y[2])), 8); 113 | uint8x16_t y000 = vcombine_u8(y00, y01); 114 | uint8x16_t y100 = vcombine_u8(y10, y11); 115 | if (videoRange) 116 | { 117 | y000 = vqaddq_u8(vcombine_u8(y00, y01), vdupq_n_u8(16)); 118 | y100 = vqaddq_u8(vcombine_u8(y10, y11), vdupq_n_u8(16)); 119 | } 120 | else 121 | { 122 | y000 = vcombine_u8(y00, y01); 123 | y100 = vcombine_u8(y10, y11); 124 | } 125 | 126 | int16x8_t r000 = vpadalq_u8(vpaddlq_u8(rgb00.val[0]), rgb10.val[0]); 127 | int16x8_t g000 = vpadalq_u8(vpaddlq_u8(rgb00.val[1]), rgb10.val[1]); 128 | int16x8_t b000 = vpadalq_u8(vpaddlq_u8(rgb00.val[2]), rgb10.val[2]); 129 | 130 | uint8x8_t u00 = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, 
vdupq_n_s16(U[0] >> 2)), g000, vdupq_n_s16(U[1] >> 2)), b000, vdupq_n_s16(U[2] >> 2)), 8); 131 | uint8x8_t v00 = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(V[0] >> 2)), g000, vdupq_n_s16(V[1] >> 2)), b000, vdupq_n_s16(V[2] >> 2)), 8); 132 | u00 = vadd_u8(u00, vdup_n_u8(128)); 133 | v00 = vadd_u8(v00, vdup_n_u8(128)); 134 | 135 | vst1q_u8(y0, y000); y0 += 16; 136 | vst1q_u8(y1, y100); y1 += 16; 137 | if (interleaved) 138 | { 139 | if (firstU) 140 | { 141 | uint8x8x2_t uv00 = vzip_u8(u00, v00); 142 | vst1q_u8(u0, vcombine_u8(uv00.val[0], uv00.val[1])); u0 += 16; 143 | } 144 | else 145 | { 146 | uint8x8x2_t uv00 = vzip_u8(v00, u00); 147 | vst1q_u8(v0, vcombine_u8(uv00.val[0], uv00.val[1])); v0 += 16; 148 | } 149 | } 150 | else 151 | { 152 | vst1_u8(u0, u00); u0 += 8; 153 | vst1_u8(v0, v00); v0 += 8; 154 | } 155 | } 156 | if (componentRGB == 4) 157 | continue; 158 | #elif HAVE_AVX2 159 | int halfWidth16 = (componentRGB == 4) ? halfWidth / 16 : 0; 160 | for (int w = 0; w < halfWidth16; ++w) 161 | { 162 | __m256i rgb00[4] = { _mm256_loadu_si256((__m256i*)rgb0), _mm256_loadu_si256((__m256i*)rgb0 + 1), _mm256_loadu_si256((__m256i*)rgb0 + 2), _mm256_loadu_si256((__m256i*)rgb0 + 3) }; rgb0 += 32 * 4; 163 | __m256i rgb10[4] = { _mm256_loadu_si256((__m256i*)rgb1), _mm256_loadu_si256((__m256i*)rgb1 + 1), _mm256_loadu_si256((__m256i*)rgb1 + 2), _mm256_loadu_si256((__m256i*)rgb1 + 3) }; rgb1 += 32 * 4; 164 | 165 | __m256i yy = _mm256_setr_epi8(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 166 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 167 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 168 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 169 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 170 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 171 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 172 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 173 | __m256i yy000 = _mm256_maddubs_epi16(rgb00[0], yy); 174 | __m256i yy001 = _mm256_maddubs_epi16(rgb00[1], yy); 175 | __m256i yy010 = _mm256_maddubs_epi16(rgb00[2], yy); 176 | __m256i yy011 = 
_mm256_maddubs_epi16(rgb00[3], yy); 177 | __m256i yy100 = _mm256_maddubs_epi16(rgb10[0], yy); 178 | __m256i yy101 = _mm256_maddubs_epi16(rgb10[1], yy); 179 | __m256i yy110 = _mm256_maddubs_epi16(rgb10[2], yy); 180 | __m256i yy111 = _mm256_maddubs_epi16(rgb10[3], yy); 181 | __m256i y00 = _mm256_hadd_epi16(yy000, yy001); 182 | __m256i y01 = _mm256_hadd_epi16(yy010, yy011); 183 | __m256i y10 = _mm256_hadd_epi16(yy100, yy101); 184 | __m256i y11 = _mm256_hadd_epi16(yy110, yy111); 185 | y00 = _mm256_srli_epi16(y00, 7); 186 | y01 = _mm256_srli_epi16(y01, 7); 187 | y10 = _mm256_srli_epi16(y10, 7); 188 | y11 = _mm256_srli_epi16(y11, 7); 189 | __m256i y000 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(y00, y01), _mm256_setr_epi32(0,4,1,5,2,6,3,7)); 190 | __m256i y100 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(y10, y11), _mm256_setr_epi32(0,4,1,5,2,6,3,7)); 191 | if (videoRange) 192 | { 193 | y000 = _mm256_adds_epu8(y000, _mm256_set1_epi8(16)); 194 | y100 = _mm256_adds_epu8(y100, _mm256_set1_epi8(16)); 195 | } 196 | 197 | __m256i uv00 = _mm256_avg_epu8(rgb00[0], rgb10[0]); 198 | __m256i uv01 = _mm256_avg_epu8(rgb00[1], rgb10[1]); 199 | __m256i uv10 = _mm256_avg_epu8(rgb00[2], rgb10[2]); 200 | __m256i uv11 = _mm256_avg_epu8(rgb00[3], rgb10[3]); 201 | __m256i uv0 = _mm256_avg_epu8(_mm256_shuffle_ps(uv00, uv01, _MM_SHUFFLE(2,0,2,0)), _mm256_shuffle_ps(uv00, uv01, _MM_SHUFFLE(3,1,3,1))); 202 | __m256i uv1 = _mm256_avg_epu8(_mm256_shuffle_ps(uv10, uv11, _MM_SHUFFLE(2,0,2,0)), _mm256_shuffle_ps(uv10, uv11, _MM_SHUFFLE(3,1,3,1))); 203 | __m256i uu = _mm256_setr_epi8(U[0], U[1], U[2], 0, 204 | U[0], U[1], U[2], 0, 205 | U[0], U[1], U[2], 0, 206 | U[0], U[1], U[2], 0, 207 | U[0], U[1], U[2], 0, 208 | U[0], U[1], U[2], 0, 209 | U[0], U[1], U[2], 0, 210 | U[0], U[1], U[2], 0); 211 | __m256i vv = _mm256_setr_epi8(V[0], V[1], V[2], 0, 212 | V[0], V[1], V[2], 0, 213 | V[0], V[1], V[2], 0, 214 | V[0], V[1], V[2], 0, 215 | V[0], V[1], V[2], 0, 216 | V[0], V[1], V[2], 0, 217 | 
V[0], V[1], V[2], 0, 218 | V[0], V[1], V[2], 0); 219 | __m256i uu00 = _mm256_maddubs_epi16(uv0, uu); 220 | __m256i uu01 = _mm256_maddubs_epi16(uv1, uu); 221 | __m256i vv00 = _mm256_maddubs_epi16(uv0, vv); 222 | __m256i vv01 = _mm256_maddubs_epi16(uv1, vv); 223 | __m256i uu02 = _mm256_hadd_epi16(uu00, uu01); 224 | __m256i vv02 = _mm256_hadd_epi16(vv00, vv01); 225 | uu02 = _mm256_srai_epi16(uu02, 8); 226 | vv02 = _mm256_srai_epi16(vv02, 8); 227 | __m256i mask = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 228 | 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); 229 | __m256i uv02 = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(_mm256_packs_epi16(uu02, vv02), _MM_SHUFFLE(3,1,2,0)), mask); 230 | uv02 = _mm256_sub_epi8(uv02, _mm256_set1_epi8(-128)); 231 | 232 | _mm256_storeu_si256((__m256i*)y0, y000); y0 += 32; 233 | _mm256_storeu_si256((__m256i*)y1, y100); y1 += 32; 234 | if (interleaved) 235 | { 236 | __m128i u00 = _mm256_extractf128_si256(uv02, 0); 237 | __m128i v00 = _mm256_extractf128_si256(uv02, 1); 238 | if (firstU) 239 | { 240 | __m256i uv00 = _mm256_setr_m128i(_mm_unpacklo_epi8(u00, v00), _mm_unpackhi_epi8(u00, v00)); 241 | _mm256_storeu_si256((__m256i*)u0, uv00); u0 += 32; 242 | } 243 | else 244 | { 245 | __m256i uv00 = _mm256_setr_m128i(_mm_unpacklo_epi8(v00, u00), _mm_unpackhi_epi8(v00, u00)); 246 | _mm256_storeu_si256((__m256i*)v0, uv00); v0 += 32; 247 | } 248 | } 249 | else 250 | { 251 | _mm256_storeu2_m128i((__m128i*)v0, (__m128i*)u0, uv02); u0 += 16; v0 += 16; 252 | } 253 | } 254 | if (componentRGB == 4) 255 | continue; 256 | #elif HAVE_SSE2 || HAVE_SSSE3 257 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 258 | for (int w = 0; w < halfWidth8; ++w) 259 | { 260 | __m128i rgb00[4] = { _mm_loadu_si128((__m128i*)rgb0), _mm_loadu_si128((__m128i*)rgb0 + 1), _mm_loadu_si128((__m128i*)rgb0 + 2), _mm_loadu_si128((__m128i*)rgb0 + 3) }; rgb0 += 16 * 4; 261 | __m128i rgb10[4] = { _mm_loadu_si128((__m128i*)rgb1), _mm_loadu_si128((__m128i*)rgb1 + 1), _mm_loadu_si128((__m128i*)rgb1 + 2), _mm_loadu_si128((__m128i*)rgb1 + 3) }; rgb1 += 16 * 4; 262 | 263 | #if HAVE_SSSE3 264 | __m128i yy = _mm_setr_epi8(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 265 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 266 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 267 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 268 | __m128i yy000 = _mm_maddubs_epi16(rgb00[0], yy); 269 | __m128i yy001 = _mm_maddubs_epi16(rgb00[1], yy); 270 | __m128i yy010 = _mm_maddubs_epi16(rgb00[2], yy); 271 | __m128i yy011 = _mm_maddubs_epi16(rgb00[3], yy); 272 | __m128i yy100 = _mm_maddubs_epi16(rgb10[0], yy); 273 | __m128i yy101 = _mm_maddubs_epi16(rgb10[1], yy); 274 | __m128i yy110 = _mm_maddubs_epi16(rgb10[2], yy); 275 | __m128i yy111 = _mm_maddubs_epi16(rgb10[3], yy); 276 | __m128i y00 = _mm_hadd_epi16(yy000, yy001); 277 | __m128i y01 = _mm_hadd_epi16(yy010, yy011); 278 | __m128i y10 = _mm_hadd_epi16(yy100, yy101); 279 | __m128i y11 = _mm_hadd_epi16(yy110, yy111); 280 | #else 281 | __m128i yy = _mm_setr_epi16(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 282 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 283 | __m128i yy000 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[0], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[0], __m128i()), yy)); 284 | __m128i yy001 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[1], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[1], __m128i()), yy)); 285 | __m128i yy010 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[2], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[2], __m128i()), yy)); 286 | __m128i yy011 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[3], 
__m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[3], __m128i()), yy)); 287 | __m128i yy100 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[0], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[0], __m128i()), yy)); 288 | __m128i yy101 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[1], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[1], __m128i()), yy)); 289 | __m128i yy110 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[2], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[2], __m128i()), yy)); 290 | __m128i yy111 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[3], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[3], __m128i()), yy)); 291 | __m128i y00 = _mm_packs_epi32(_mm_madd_epi16(yy000, _mm_set1_epi16(1)), _mm_madd_epi16(yy001, _mm_set1_epi16(1))); 292 | __m128i y01 = _mm_packs_epi32(_mm_madd_epi16(yy010, _mm_set1_epi16(1)), _mm_madd_epi16(yy011, _mm_set1_epi16(1))); 293 | __m128i y10 = _mm_packs_epi32(_mm_madd_epi16(yy100, _mm_set1_epi16(1)), _mm_madd_epi16(yy101, _mm_set1_epi16(1))); 294 | __m128i y11 = _mm_packs_epi32(_mm_madd_epi16(yy110, _mm_set1_epi16(1)), _mm_madd_epi16(yy111, _mm_set1_epi16(1))); 295 | #endif 296 | y00 = _mm_srli_epi16(y00, 7); 297 | y01 = _mm_srli_epi16(y01, 7); 298 | y10 = _mm_srli_epi16(y10, 7); 299 | y11 = _mm_srli_epi16(y11, 7); 300 | __m128i y000 = _mm_packus_epi16(y00, y01); 301 | __m128i y100 = _mm_packus_epi16(y10, y11); 302 | if (videoRange) 303 | { 304 | y000 = _mm_adds_epu8(y000, _mm_set1_epi8(16)); 305 | y100 = _mm_adds_epu8(y100, _mm_set1_epi8(16)); 306 | } 307 | 308 | __m128i uv00 = _mm_avg_epu8(rgb00[0], rgb10[0]); 309 | __m128i uv01 = _mm_avg_epu8(rgb00[1], rgb10[1]); 310 | __m128i uv10 = _mm_avg_epu8(rgb00[2], rgb10[2]); 311 | __m128i uv11 = _mm_avg_epu8(rgb00[3], rgb10[3]); 312 | __m128i uv0 = _mm_avg_epu8(_mm_shuffle_ps(uv00, uv01, _MM_SHUFFLE(2,0,2,0)), _mm_shuffle_ps(uv00, uv01, _MM_SHUFFLE(3,1,3,1))); 313 | __m128i uv1 = 
_mm_avg_epu8(_mm_shuffle_ps(uv10, uv11, _MM_SHUFFLE(2,0,2,0)), _mm_shuffle_ps(uv10, uv11, _MM_SHUFFLE(3,1,3,1))); 314 | #if HAVE_SSSE3 315 | __m128i uu = _mm_setr_epi8(U[0], U[1], U[2], 0, 316 | U[0], U[1], U[2], 0, 317 | U[0], U[1], U[2], 0, 318 | U[0], U[1], U[2], 0); 319 | __m128i vv = _mm_setr_epi8(V[0], V[1], V[2], 0, 320 | V[0], V[1], V[2], 0, 321 | V[0], V[1], V[2], 0, 322 | V[0], V[1], V[2], 0); 323 | __m128i uu00 = _mm_maddubs_epi16(uv0, uu); 324 | __m128i uu01 = _mm_maddubs_epi16(uv1, uu); 325 | __m128i vv00 = _mm_maddubs_epi16(uv0, vv); 326 | __m128i vv01 = _mm_maddubs_epi16(uv1, vv); 327 | __m128i u00 = _mm_hadd_epi16(uu00, uu01); 328 | __m128i v00 = _mm_hadd_epi16(vv00, vv01); 329 | #else 330 | __m128i uu = _mm_setr_epi16(U[0], U[1], U[2], 0, 331 | U[0], U[1], U[2], 0); 332 | __m128i vv = _mm_setr_epi16(V[0], V[1], V[2], 0, 333 | V[0], V[1], V[2], 0); 334 | __m128i uu00 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv0, __m128i()), uu), _mm_madd_epi16(_mm_unpackhi_epi8(uv0, __m128i()), uu)); 335 | __m128i uu01 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv1, __m128i()), uu), _mm_madd_epi16(_mm_unpackhi_epi8(uv1, __m128i()), uu)); 336 | __m128i vv00 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv0, __m128i()), vv), _mm_madd_epi16(_mm_unpackhi_epi8(uv0, __m128i()), vv)); 337 | __m128i vv01 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv1, __m128i()), vv), _mm_madd_epi16(_mm_unpackhi_epi8(uv1, __m128i()), vv)); 338 | __m128i u00 = _mm_packs_epi32(_mm_madd_epi16(uu00, _mm_set1_epi16(1)), _mm_madd_epi16(uu01, _mm_set1_epi16(1))); 339 | __m128i v00 = _mm_packs_epi32(_mm_madd_epi16(vv00, _mm_set1_epi16(1)), _mm_madd_epi16(vv01, _mm_set1_epi16(1))); 340 | #endif 341 | u00 = _mm_srai_epi16(u00, 8); 342 | v00 = _mm_srai_epi16(v00, 8); 343 | __m128i uv02 = _mm_packs_epi16(u00, v00); 344 | uv02 = _mm_sub_epi8(uv02, _mm_set1_epi8(-128)); 345 | 346 | _mm_storeu_si128((__m128i*)y0, y000); y0 += 16; 347 | _mm_storeu_si128((__m128i*)y1, y100); 
y1 += 16; 348 | if (interleaved) 349 | { 350 | u00 = uv02; 351 | v00 = _mm_movehl_ps(uv02, uv02); 352 | if (firstU) 353 | { 354 | __m128i uv00 = _mm_unpacklo_epi8(u00, v00); 355 | _mm_storeu_si128((__m128i*)u0, uv00); u0 += 16; 356 | } 357 | else 358 | { 359 | __m128i uv00 = _mm_unpacklo_epi8(v00, u00); 360 | _mm_storeu_si128((__m128i*)v0, uv00); v0 += 16; 361 | } 362 | } 363 | else 364 | { 365 | _mm_storel_pi((__m64*)u0, uv02); u0 += 8; 366 | _mm_storeh_pi((__m64*)v0, uv02); v0 += 8; 367 | } 368 | } 369 | if (componentRGB == 4) 370 | continue; 371 | #endif 372 | for (int w = 0; w < halfWidth; ++w) 373 | { 374 | int b00 = (componentRGB >= 1) ? rgb0[0] : 255; 375 | int g00 = (componentRGB >= 2) ? rgb0[1] : 255; 376 | int r00 = (componentRGB >= 3) ? rgb0[2] : 255; 377 | int a00 = (componentRGB >= 4) ? rgb0[3] : 255; rgb0 += componentRGB; 378 | int b01 = (componentRGB >= 1) ? rgb0[0] : 255; 379 | int g01 = (componentRGB >= 2) ? rgb0[1] : 255; 380 | int r01 = (componentRGB >= 3) ? rgb0[2] : 255; 381 | int a01 = (componentRGB >= 4) ? rgb0[3] : 255; rgb0 += componentRGB; 382 | int b10 = (componentRGB >= 1) ? rgb1[0] : 255; 383 | int g10 = (componentRGB >= 2) ? rgb1[1] : 255; 384 | int r10 = (componentRGB >= 3) ? rgb1[2] : 255; 385 | int a10 = (componentRGB >= 4) ? rgb1[3] : 255; rgb1 += componentRGB; 386 | int b11 = (componentRGB >= 1) ? rgb1[0] : 255; 387 | int g11 = (componentRGB >= 2) ? rgb1[1] : 255; 388 | int r11 = (componentRGB >= 3) ? rgb1[2] : 255; 389 | int a11 = (componentRGB >= 4) ? 
rgb1[3] : 255; rgb1 += componentRGB; 390 | 391 | int r000 = (r00 + r01 + r10 + r11) / 4; 392 | int g000 = (g00 + g01 + g10 + g11) / 4; 393 | int b000 = (b00 + b01 + b10 + b11) / 4; 394 | 395 | int y00 = r00 * Y[0] + g00 * Y[1] + b00 * Y[2]; 396 | int y01 = r01 * Y[0] + g01 * Y[1] + b01 * Y[2]; 397 | int y10 = r10 * Y[0] + g10 * Y[1] + b10 * Y[2]; 398 | int y11 = r11 * Y[0] + g11 * Y[1] + b11 * Y[2]; 399 | int u00 = r000 * U[0] + g000 * U[1] + b000 * U[2]; 400 | int v00 = r000 * V[0] + g000 * V[1] + b000 * V[2]; 401 | 402 | auto clamp = [](int value) -> unsigned char 403 | { 404 | return (unsigned char)(value < 255 ? value < 0 ? 0 : value : 255); 405 | }; 406 | 407 | if (videoRange) 408 | { 409 | (*y0++) = clamp((y00 >> 8) + 16); 410 | (*y0++) = clamp((y01 >> 8) + 16); 411 | (*y1++) = clamp((y10 >> 8) + 16); 412 | (*y1++) = clamp((y11 >> 8) + 16); 413 | } 414 | else 415 | { 416 | (*y0++) = clamp(y00 >> 8); 417 | (*y0++) = clamp(y01 >> 8); 418 | (*y1++) = clamp(y10 >> 8); 419 | (*y1++) = clamp(y11 >> 8); 420 | } 421 | (*u0++) = clamp((u00 >> 8) + 128); 422 | (*v0++) = clamp((v00 >> 8) + 128); 423 | if (interleaved) 424 | { 425 | u0++; 426 | v0++; 427 | } 428 | } 429 | } 430 | } 431 | //------------------------------------------------------------------------------ 432 | #ifndef rgb2yuv_select 433 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 434 | rgb2yuv 435 | #endif 436 | //------------------------------------------------------------------------------ 437 | #ifndef rgb2yuv 438 | //------------------------------------------------------------------------------ 439 | #if defined(__llvm__) 440 | #define rgb2yuv_attribute(value) __attribute__((target(value))) 441 | #else 442 | #define rgb2yuv_attribute(value) 443 | #endif 444 | //------------------------------------------------------------------------------ 445 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 
446 | #define HAVE_NEON 1 447 | #define rgb2yuv rgb2yuv_attribute("neon") rgb2yuv_neon 448 | #include "rgb2yuv.inl" 449 | #undef rgb2yuv 450 | #undef HAVE_NEON 451 | #undef rgb2yuv_select 452 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 453 | neon() ? rgb2yuv_neon : \ 454 | rgb2yuv 455 | #endif 456 | //------------------------------------------------------------------------------ 457 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 458 | #define HAVE_SSE2 1 459 | #define rgb2yuv rgb2yuv_attribute("sse2") rgb2yuv_sse2 460 | #include "rgb2yuv.inl" 461 | #undef rgb2yuv 462 | #undef HAVE_SSE2 463 | #undef rgb2yuv_select 464 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 465 | sse2() ? rgb2yuv_sse2 : \ 466 | rgb2yuv 467 | #endif 468 | //------------------------------------------------------------------------------ 469 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 470 | #define HAVE_SSSE3 1 471 | #define rgb2yuv rgb2yuv_attribute("ssse3") rgb2yuv_ssse3 472 | #include "rgb2yuv.inl" 473 | #undef rgb2yuv 474 | #undef HAVE_SSSE3 475 | #undef rgb2yuv_select 476 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 477 | ssse3() ? rgb2yuv_ssse3 : \ 478 | sse2() ? rgb2yuv_sse2 : \ 479 | rgb2yuv 480 | #endif 481 | //------------------------------------------------------------------------------ 482 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 483 | #define HAVE_AVX2 1 484 | #define rgb2yuv rgb2yuv_attribute("avx2") rgb2yuv_avx2 485 | #include "rgb2yuv.inl" 486 | #undef rgb2yuv 487 | #undef HAVE_AVX2 488 | #undef rgb2yuv_select 489 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 490 | avx2() ? rgb2yuv_avx2 : \ 491 | ssse3() ? rgb2yuv_ssse3 : \ 492 | sse2() ? 
rgb2yuv_sse2 : \ 493 | rgb2yuv 494 | #endif 495 | //------------------------------------------------------------------------------ 496 | #endif 497 | //------------------------------------------------------------------------------ 498 | -------------------------------------------------------------------------------- /yuv.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | #ifndef xxYUV_DEPRECATED 15 | #include "yuv2yuva.h" 16 | //------------------------------------------------------------------------------ 17 | inline void yuv_yu12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 18 | { 19 | yuv2yuva_parameter parameter = 20 | { 21 | .width = width, 22 | .height = height, 23 | .y = input, 24 | .alignWidth = alignWidth, 25 | .alignHeight = alignHeight, 26 | .alignSize = alignSize, 27 | .output = output, 28 | .strideOutput = strideOutput, 29 | .swizzleOutput = yuvSwizzle, 30 | }; 31 | yuv2yuva_yu12(¶meter); 32 | } 33 | //------------------------------------------------------------------------------ 34 | inline void yuv_yv12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 35 | { 36 | yuv2yuva_parameter parameter = 37 | { 38 | .width = width, 39 | .height = height, 40 | .y = input, 41 | .alignWidth = alignWidth, 42 | .alignHeight = 
alignHeight, 43 | .alignSize = alignSize, 44 | .output = output, 45 | .strideOutput = strideOutput, 46 | .swizzleOutput = yuvSwizzle, 47 | }; 48 | yuv2yuva_yv12(¶meter); 49 | } 50 | //------------------------------------------------------------------------------ 51 | inline void yuv_nv12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 52 | { 53 | yuv2yuva_parameter parameter = 54 | { 55 | .width = width, 56 | .height = height, 57 | .y = input, 58 | .alignWidth = alignWidth, 59 | .alignHeight = alignHeight, 60 | .alignSize = alignSize, 61 | .output = output, 62 | .strideOutput = strideOutput, 63 | .swizzleOutput = yuvSwizzle, 64 | }; 65 | yuv2yuva_nv12(¶meter); 66 | } 67 | //------------------------------------------------------------------------------ 68 | inline void yuv_nv21_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 69 | { 70 | yuv2yuva_parameter parameter = 71 | { 72 | .width = width, 73 | .height = height, 74 | .y = input, 75 | .alignWidth = alignWidth, 76 | .alignHeight = alignHeight, 77 | .alignSize = alignSize, 78 | .output = output, 79 | .strideOutput = strideOutput, 80 | .swizzleOutput = yuvSwizzle, 81 | }; 82 | yuv2yuva_nv21(¶meter); 83 | } 84 | //------------------------------------------------------------------------------ 85 | #endif 86 | //------------------------------------------------------------------------------ 87 | -------------------------------------------------------------------------------- /yuv2rgb.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | 
//============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "yuv2rgb.inl" 12 | #include "yuv2rgb.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void yuv2rgb_yu12(const yuv2rgb_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* y = parameter->y; 23 | const void* u = parameter->u; 24 | const void* v = parameter->v; 25 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 26 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 27 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 28 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 29 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 30 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 31 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 32 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 33 | bool videoRange = parameter->videoRange; 34 | 35 | void* rgb = parameter->rgb; 36 | int componentRGB = parameter->componentRGB; 37 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 38 | bool swizzleRGB = parameter->swizzleRGB; 39 | if (strideRGB < 0) 40 | { 41 | rgb = (char*)rgb - (strideRGB * (height - 1)); 42 | } 43 | 44 | u = u ? u : (char*)y + sizeY; 45 | v = v ? 
v : (char*)y + sizeY + sizeU; 46 | 47 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 48 | 49 | if (componentRGB == 3) 50 | { 51 | if (swizzleRGB) 52 | { 53 | if (videoRange) 54 | { 55 | static auto select = yuv2rgb_select(3, true, false, false, true); 56 | converter = select; 57 | } 58 | else 59 | { 60 | static auto select = yuv2rgb_select(3, true, false, false, false); 61 | converter = select; 62 | } 63 | } 64 | else 65 | { 66 | if (videoRange) 67 | { 68 | static auto select = yuv2rgb_select(3, false, false, false, true); 69 | converter = select; 70 | } 71 | else 72 | { 73 | static auto select = yuv2rgb_select(3, false, false, false, false); 74 | converter = select; 75 | } 76 | } 77 | } 78 | else if (componentRGB == 4) 79 | { 80 | if (swizzleRGB) 81 | { 82 | if (videoRange) 83 | { 84 | static auto select = yuv2rgb_select(4, true, false, false, true); 85 | converter = select; 86 | } 87 | else 88 | { 89 | static auto select = yuv2rgb_select(4, true, false, false, false); 90 | converter = select; 91 | } 92 | } 93 | else 94 | { 95 | if (videoRange) 96 | { 97 | static auto select = yuv2rgb_select(4, false, false, false, true); 98 | converter = select; 99 | } 100 | else 101 | { 102 | static auto select = yuv2rgb_select(4, false, false, false, false); 103 | converter = select; 104 | } 105 | } 106 | } 107 | else 108 | { 109 | return; 110 | } 111 | 112 | converter(width, height, y, u, v, strideY, strideU, strideV, rgb, strideRGB); // fix: third stride was strideU twice; strideV was computed but never used 113 | } 114 | //------------------------------------------------------------------------------ 115 | void yuv2rgb_yv12(const yuv2rgb_parameter* parameter) 116 | { 117 | int width = parameter->width; 118 | int height = parameter->height; 119 | 120 | const void* y = parameter->y; 121 | const void* u = parameter->u; 122 | const void* v = parameter->v; 123 | int alignWidth = parameter->alignWidth ?
parameter->alignWidth : 16; 124 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 125 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 126 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 127 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 128 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 129 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 130 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 131 | bool videoRange = parameter->videoRange; 132 | 133 | void* rgb = parameter->rgb; 134 | int componentRGB = parameter->componentRGB; 135 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 136 | bool swizzleRGB = parameter->swizzleRGB; 137 | if (strideRGB < 0) 138 | { 139 | rgb = (char*)rgb - (strideRGB * (height - 1)); 140 | } 141 | 142 | u = u ? u : (char*)y + sizeY + sizeU; 143 | v = v ? 
v : (char*)y + sizeY; 144 | 145 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 146 | 147 | if (componentRGB == 3) 148 | { 149 | if (swizzleRGB) 150 | { 151 | if (videoRange) 152 | { 153 | static auto select = yuv2rgb_select(3, true, false, false, true); 154 | converter = select; 155 | } 156 | else 157 | { 158 | static auto select = yuv2rgb_select(3, true, false, false, false); 159 | converter = select; 160 | } 161 | } 162 | else 163 | { 164 | if (videoRange) 165 | { 166 | static auto select = yuv2rgb_select(3, false, false, false, true); 167 | converter = select; 168 | } 169 | else 170 | { 171 | static auto select = yuv2rgb_select(3, false, false, false, false); 172 | converter = select; 173 | } 174 | } 175 | } 176 | else if (componentRGB == 4) 177 | { 178 | if (swizzleRGB) 179 | { 180 | if (videoRange) 181 | { 182 | static auto select = yuv2rgb_select(4, true, false, false, true); 183 | converter = select; 184 | } 185 | else 186 | { 187 | static auto select = yuv2rgb_select(4, true, false, false, false); 188 | converter = select; 189 | } 190 | } 191 | else 192 | { 193 | if (videoRange) 194 | { 195 | static auto select = yuv2rgb_select(4, false, false, false, true); 196 | converter = select; 197 | } 198 | else 199 | { 200 | static auto select = yuv2rgb_select(4, false, false, false, false); 201 | converter = select; 202 | } 203 | } 204 | } 205 | else 206 | { 207 | return; 208 | } 209 | 210 | converter(width, height, y, u, v, strideY, strideU, strideV, rgb, strideRGB); // fix: third stride was strideU twice; strideV was computed but never used 211 | } 212 | //------------------------------------------------------------------------------ 213 | void yuv2rgb_nv12(const yuv2rgb_parameter* parameter) 214 | { 215 | int width = parameter->width; 216 | int height = parameter->height; 217 | 218 | const void* y = parameter->y; 219 | const void* u = parameter->u; 220 | const void* v = parameter->v; 221 | int alignWidth = parameter->alignWidth ?
parameter->alignWidth : 16; 222 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 223 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 224 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 225 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 226 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 227 | bool videoRange = parameter->videoRange; 228 | 229 | void* rgb = parameter->rgb; 230 | int componentRGB = parameter->componentRGB; 231 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 232 | bool swizzleRGB = parameter->swizzleRGB; 233 | if (strideRGB < 0) 234 | { 235 | rgb = (char*)rgb - (strideRGB * (height - 1)); 236 | } 237 | 238 | u = u ? u : (char*)y + sizeY; 239 | v = v ? v : (char*)y + sizeY + 1; 240 | 241 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 242 | 243 | if (componentRGB == 3) 244 | { 245 | if (swizzleRGB) 246 | { 247 | if (videoRange) 248 | { 249 | static auto select = yuv2rgb_select(3, true, true, true, true); 250 | converter = select; 251 | } 252 | else 253 | { 254 | static auto select = yuv2rgb_select(3, true, true, true, false); 255 | converter = select; 256 | } 257 | } 258 | else 259 | { 260 | if (videoRange) 261 | { 262 | static auto select = yuv2rgb_select(3, false, true, true, true); 263 | converter = select; 264 | } 265 | else 266 | { 267 | static auto select = yuv2rgb_select(3, false, true, true, false); 268 | converter = select; 269 | } 270 | } 271 | } 272 | else if (componentRGB == 4) 273 | { 274 | if (swizzleRGB) 275 | { 276 | if (videoRange) 277 | { 278 | static auto select = yuv2rgb_select(4, true, true, true, true); 279 | converter = select; 280 | } 281 | else 282 | { 283 | static auto select = yuv2rgb_select(4, true, true, true, false); 284 | converter = select; 285 | } 286 
| } 287 | else 288 | { 289 | if (videoRange) 290 | { 291 | static auto select = yuv2rgb_select(4, false, true, true, true); 292 | converter = select; 293 | } 294 | else 295 | { 296 | static auto select = yuv2rgb_select(4, false, true, true, false); 297 | converter = select; 298 | } 299 | } 300 | } 301 | else 302 | { 303 | return; 304 | } 305 | 306 | converter(width, height, y, u, v, strideY, strideY, strideY, rgb, strideRGB); 307 | } 308 | //------------------------------------------------------------------------------ 309 | void yuv2rgb_nv21(const yuv2rgb_parameter* parameter) 310 | { 311 | int width = parameter->width; 312 | int height = parameter->height; 313 | 314 | const void* y = parameter->y; 315 | const void* u = parameter->u; 316 | const void* v = parameter->v; 317 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 318 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 319 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 320 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 321 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 322 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 323 | bool videoRange = parameter->videoRange; 324 | 325 | void* rgb = parameter->rgb; 326 | int componentRGB = parameter->componentRGB; 327 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 328 | bool swizzleRGB = parameter->swizzleRGB; 329 | if (strideRGB < 0) 330 | { 331 | rgb = (char*)rgb - (strideRGB * (height - 1)); 332 | } 333 | 334 | u = u ? u : (char*)y + sizeY + 1; 335 | v = v ? 
v : (char*)y + sizeY; 336 | 337 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 338 | 339 | if (componentRGB == 3) 340 | { 341 | if (swizzleRGB) 342 | { 343 | if (videoRange) 344 | { 345 | static auto select = yuv2rgb_select(3, true, true, false, true); 346 | converter = select; 347 | } 348 | else 349 | { 350 | static auto select = yuv2rgb_select(3, true, true, false, false); 351 | converter = select; 352 | } 353 | } 354 | else 355 | { 356 | if (videoRange) 357 | { 358 | static auto select = yuv2rgb_select(3, false, true, false, true); 359 | converter = select; 360 | } 361 | else 362 | { 363 | static auto select = yuv2rgb_select(3, false, true, false, false); 364 | converter = select; 365 | } 366 | } 367 | } 368 | else if (componentRGB == 4) 369 | { 370 | if (swizzleRGB) 371 | { 372 | if (videoRange) 373 | { 374 | static auto select = yuv2rgb_select(4, true, true, false, true); 375 | converter = select; 376 | } 377 | else 378 | { 379 | static auto select = yuv2rgb_select(4, true, true, false, false); 380 | converter = select; 381 | } 382 | } 383 | else 384 | { 385 | if (videoRange) 386 | { 387 | static auto select = yuv2rgb_select(4, false, true, false, true); 388 | converter = select; 389 | } 390 | else 391 | { 392 | static auto select = yuv2rgb_select(4, false, true, false, false); 393 | converter = select; 394 | } 395 | } 396 | } 397 | else 398 | { 399 | return; 400 | } 401 | 402 | converter(width, height, y, u, v, strideY, strideY, strideY, rgb, strideRGB); 403 | } 404 | //------------------------------------------------------------------------------ 405 | -------------------------------------------------------------------------------- /yuv2rgb.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Header 3 | // 4 | // 
Copyright (c) 2020-2021 TAiGA
// https://github.com/metarutaiga/xxYUV
//==============================================================================
#pragma once

#ifndef xxYUV_EXPORT
#define xxYUV_EXPORT
#endif

//------------------------------------------------------------------------------
// Describes one YUV -> RGB conversion: input planes, layout/alignment, and
// the output buffer. Zeroed fields take the documented defaults.
typedef struct _yuv2rgb_parameter
{
    int width;              // image width in pixels
    int height;             // image height in pixels

    const void* y;          // Y plane
    const void* u;          // U plane (or interleaved chroma); may be NULL —
                            // then derived from the end of the Y plane
    const void* v;          // V plane (or interleaved chroma); may be NULL
    int alignWidth;         // Y-stride alignment used when strideY == 0 (default 16)
    int alignHeight;        // height alignment used when sizing planes (default 1)
    int alignSize;          // plane-size alignment in bytes (default 1)
    int strideY;            // bytes per Y row; 0 = align(width, alignWidth)
    int strideU;            // bytes per U row — NOTE(review): the converters in
                            // yuv2rgb.cpp currently pass strideY instead; verify
    int strideV;            // bytes per V row — NOTE(review): currently ignored, see above
    bool videoRange;        // true = BT.709 video-range coefficients, false = full range

    void* rgb;              // output pixel buffer
    int componentRGB;       // bytes per output pixel: 3 = RGB, 4 = RGBA (others rejected)
    int strideRGB;          // bytes per output row; 0 = componentRGB * width;
                            // negative writes the image bottom-up
    bool swizzleRGB;        // true = swap R and B (BGR/BGRA output)
} yuv2rgb_parameter;
//------------------------------------------------------------------------------
xxYUV_EXPORT void yuv2rgb_yu12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_yv12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_nv12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_nv21(const yuv2rgb_parameter* parameter);
//------------------------------------------------------------------------------
#ifndef xxYUV_DEPRECATED
//------------------------------------------------------------------------------
// Deprecated convenience wrapper: builds a yuv2rgb_parameter from loose
// arguments and forwards to yuv2rgb_yu12. Note fullRange is inverted into
// videoRange.
inline void yuv2rgb_yu12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1)
{
    yuv2rgb_parameter parameter =
    {
        .width = width,
        .height = height,
        .y = yuv,
        .alignWidth = alignWidth,
        .alignHeight = alignHeight,
        .alignSize = alignSize,
        .videoRange = !fullRange,
        .rgb = rgb,
        .componentRGB = rgbWidth,
        .strideRGB = strideRGB,
        .swizzleRGB = rgbSwizzle,
    };
    yuv2rgb_yu12(&parameter);
}
//------------------------------------------------------------------------------ 62 | inline void yuv2rgb_yv12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 63 | { 64 | yuv2rgb_parameter parameter = 65 | { 66 | .width = width, 67 | .height = height, 68 | .y = yuv, 69 | .alignWidth = alignWidth, 70 | .alignHeight = alignHeight, 71 | .alignSize = alignSize, 72 | .videoRange = !fullRange, 73 | .rgb = rgb, 74 | .componentRGB = rgbWidth, 75 | .strideRGB = strideRGB, 76 | .swizzleRGB = rgbSwizzle, 77 | }; 78 | yuv2rgb_yv12(¶meter); 79 | } 80 | //------------------------------------------------------------------------------ 81 | inline void yuv2rgb_nv12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 82 | { 83 | yuv2rgb_parameter parameter = 84 | { 85 | .width = width, 86 | .height = height, 87 | .y = yuv, 88 | .alignWidth = alignWidth, 89 | .alignHeight = alignHeight, 90 | .alignSize = alignSize, 91 | .videoRange = !fullRange, 92 | .rgb = rgb, 93 | .componentRGB = rgbWidth, 94 | .strideRGB = strideRGB, 95 | .swizzleRGB = rgbSwizzle, 96 | }; 97 | yuv2rgb_nv12(¶meter); 98 | } 99 | //------------------------------------------------------------------------------ 100 | inline void yuv2rgb_nv21(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 101 | { 102 | yuv2rgb_parameter parameter = 103 | { 104 | .width = width, 105 | .height = height, 106 | .y = yuv, 107 | .alignWidth = alignWidth, 108 | .alignHeight = alignHeight, 109 | .alignSize = alignSize, 110 | .videoRange = !fullRange, 111 | .rgb = rgb, 112 | .componentRGB = rgbWidth, 113 | 
.strideRGB = strideRGB, 114 | .swizzleRGB = rgbSwizzle, 115 | }; 116 | yuv2rgb_nv21(¶meter); 117 | } 118 | //------------------------------------------------------------------------------ 119 | #endif 120 | //------------------------------------------------------------------------------ 121 | -------------------------------------------------------------------------------- /yuv2rgb.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | // BT.709 - Video Range 8 | // Y U V 9 | // R = 1.164384 0.000000 1.792741 10 | // G = 1.164384 -0.213249 -0.532909 11 | // B = 1.164384 2.112402 0.000000 12 | // 13 | // BT.709 - Full Range 14 | // Y U V 15 | // R = 1.000000 0.000000 1.581000 16 | // G = 1.000000 -0.188062 -0.469967 17 | // B = 1.000000 1.862906 0.000000 18 | #define vY 1.164384 19 | #define vUG -0.213249 20 | #define vUB 2.112402 21 | #define vVR 1.792741 22 | #define vVG -0.532909 23 | #define fY 1.000000 24 | #define fUG -0.188062 25 | #define fUB 1.862906 26 | #define fVR 1.581000 27 | #define fVG -0.469967 28 | 29 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 30 | # include 31 | # define NEON_FAST 1 32 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 33 | # include 34 | # if defined(__llvm__) 35 | # include 36 | # include 37 | # endif 38 | # define _MM_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 39 | __m128i T0, T1, T2, T3; \ 40 | T0 = _mm_unpacklo_epi8(R0, R1); \ 41 | T1 = _mm_unpacklo_epi8(R2, R3); \ 42 | T2 = _mm_unpackhi_epi8(R0, R1); \ 43 | T3 = _mm_unpackhi_epi8(R2, R3); \ 44 | R0 = _mm_unpacklo_epi16(T0, T1); \ 45 | R1 = 
_mm_unpackhi_epi16(T0, T1); \ 46 | R2 = _mm_unpacklo_epi16(T2, T3); \ 47 | R3 = _mm_unpackhi_epi16(T2, T3); \ 48 | } 49 | # define _MM256_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 50 | __m256i T0, T1, T2, T3; \ 51 | T0 = _mm256_unpacklo_epi8(R0, R1); \ 52 | T1 = _mm256_unpacklo_epi8(R2, R3); \ 53 | T2 = _mm256_unpackhi_epi8(R0, R1); \ 54 | T3 = _mm256_unpackhi_epi8(R2, R3); \ 55 | R0 = _mm256_unpacklo_epi16(T0, T1); \ 56 | R1 = _mm256_unpackhi_epi16(T0, T1); \ 57 | R2 = _mm256_unpacklo_epi16(T2, T3); \ 58 | R3 = _mm256_unpackhi_epi16(T2, T3); \ 59 | } 60 | # define _MM256_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 61 | __m256i T0, T1, T2, T3; \ 62 | T0 = _mm256_permute2x128_si256(R0, R1, 32); \ 63 | T1 = _mm256_permute2x128_si256(R0, R1, 49); \ 64 | T2 = _mm256_permute2x128_si256(R2, R3, 32); \ 65 | T3 = _mm256_permute2x128_si256(R2, R3, 49); \ 66 | R0 = T0; \ 67 | R2 = T1; \ 68 | R1 = T2; \ 69 | R3 = T3; \ 70 | } 71 | # define _MM512_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 72 | __m512i T0, T1, T2, T3; \ 73 | T0 = _mm512_unpacklo_epi8(R0, R1); \ 74 | T1 = _mm512_unpacklo_epi8(R2, R3); \ 75 | T2 = _mm512_unpackhi_epi8(R0, R1); \ 76 | T3 = _mm512_unpackhi_epi8(R2, R3); \ 77 | R0 = _mm512_unpacklo_epi16(T0, T1); \ 78 | R1 = _mm512_unpackhi_epi16(T0, T1); \ 79 | R2 = _mm512_unpacklo_epi16(T2, T3); \ 80 | R3 = _mm512_unpackhi_epi16(T2, T3); \ 81 | } 82 | # define _MM512_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 83 | __m512i T0, T1, T2, T3; \ 84 | T0 = _mm512_shuffle_i32x4(R0, R1, 0x44); \ 85 | T1 = _mm512_shuffle_i32x4(R2, R3, 0x44); \ 86 | T2 = _mm512_shuffle_i32x4(R0, R1, 0xEE); \ 87 | T3 = _mm512_shuffle_i32x4(R2, R3, 0xEE); \ 88 | R0 = _mm512_shuffle_i32x4(T0, T1, 0x88); \ 89 | R1 = _mm512_shuffle_i32x4(T0, T1, 0xDD); \ 90 | R2 = _mm512_shuffle_i32x4(T2, T3, 0x88); \ 91 | R3 = _mm512_shuffle_i32x4(T2, T3, 0xDD); \ 92 | } 93 | #endif 94 | 95 | //------------------------------------------------------------------------------ 96 | template 97 | void yuv2rgb(int width, int height, const void* 
y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB) 98 | { 99 | int halfWidth = width >> 1; 100 | int halfHeight = height >> 1; 101 | 102 | int iR = swizzleRGB ? 2 : 0; 103 | int iG = 1; 104 | int iB = swizzleRGB ? 0 : 2; 105 | int iA = 3; 106 | 107 | int Y, UG, UB, VR, VG; 108 | if (videoRange) 109 | { 110 | Y = (int)(vY * 256); 111 | UG = (int)(vUG * 255); UB = (int)(vUB * 255); 112 | VR = (int)(vVR * 255); VG = (int)(vVG * 255); 113 | } 114 | else 115 | { 116 | Y = (int)(fY * 256); 117 | UG = (int)(fUG * 255); UB = (int)(fUB * 255); 118 | VR = (int)(fVR * 255); VG = (int)(fVG * 255); 119 | } 120 | 121 | for (int h = 0; h < halfHeight; ++h) 122 | { 123 | const unsigned char* y0 = (unsigned char*)y; 124 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 125 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 126 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 127 | unsigned char* rgb0 = (unsigned char*)rgb; 128 | unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 129 | #if HAVE_NEON 130 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 131 | for (int w = 0; w < halfWidth8; ++w) 132 | { 133 | uint8x16_t y00lh = vld1q_u8(y0); y0 += 16; 134 | uint8x16_t y10lh = vld1q_u8(y1); y1 += 16; 135 | uint8x8_t y00; 136 | uint8x8_t y01; 137 | uint8x8_t y10; 138 | uint8x8_t y11; 139 | if (videoRange) 140 | { 141 | y00lh = vqsubq_u8(y00lh, vdupq_n_u8(16)); 142 | y10lh = vqsubq_u8(y10lh, vdupq_n_u8(16)); 143 | y00 = vshrn_n_u16(vmull_u8(vget_low_u8(y00lh), vdup_n_u8(Y >> 1)), 7); 144 | y01 = vshrn_n_u16(vmull_u8(vget_high_u8(y00lh), vdup_n_u8(Y >> 1)), 7); 145 | y10 = vshrn_n_u16(vmull_u8(vget_low_u8(y10lh), vdup_n_u8(Y >> 1)), 7); 146 | y11 = vshrn_n_u16(vmull_u8(vget_high_u8(y10lh), vdup_n_u8(Y >> 1)), 7); 147 | } 148 | else 149 | { 150 | y00 = vget_low_u8(y00lh); 151 | y01 = vget_high_u8(y00lh); 152 | y10 = vget_low_u8(y10lh); 153 | y11 = vget_high_u8(y10lh); 154 | } 155 | 156 | int8x8_t u000; 157 | int8x8_t v000; 158 | if (interleaved) 159 | { 160 | if (firstU) 161 | { 162 | int8x16_t uv00 = vld1q_u8(u0); u0 += 16; 163 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 164 | int8x16_t uv000 = vaddq_s8(vcombine_s8(uv00lh.val[0], uv00lh.val[1]), vdupq_n_s8(-128)); 165 | u000 = vget_low_s8(uv000); 166 | v000 = vget_high_s8(uv000); 167 | } 168 | else 169 | { 170 | int8x16_t uv00 = vld1q_u8(v0); v0 += 16; 171 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 172 | int8x16_t uv000 = vaddq_s8(vcombine_s8(uv00lh.val[1], uv00lh.val[0]), vdupq_n_s8(-128)); 173 | u000 = vget_low_s8(uv000); 174 | v000 = vget_high_s8(uv000); 175 | } 176 | } 177 | else 178 | { 179 | int8x16_t uv000 = vaddq_s8(vcombine_s8(vld1_u8(u0), vld1_u8(v0)), vdupq_n_s8(-128)); u0 += 8; v0 += 8; 180 | u000 = vget_low_s8(uv000); 181 | v000 = vget_high_s8(uv000); 182 | } 183 | 184 | #if NEON_FAST 185 | int16x8_t dR = vshrq_n_s16( vmull_s8(v000, vdup_n_s8(VR >> 2)), 6); 186 | int16x8_t dG = vshrq_n_s16(vmlal_s8(vmull_s8(u000, vdup_n_s8(UG >> 1)), v000, vdup_n_s8(VG >> 1)), 7); 187 | int16x8_t dB = 
vshrq_n_s16( vmull_s8(u000, vdup_n_s8(UB >> 3)), 5); 188 | #else 189 | int16x8_t u00 = vshll_n_s8(u000, 7); 190 | int16x8_t v00 = vshll_n_s8(v000, 7); 191 | 192 | int16x8_t dR = vqdmulhq_s16(v00, vdupq_n_s16(VR)); 193 | int16x8_t dG = vaddq_s16(vqdmulhq_s16(u00, vdupq_n_s16(UG)), vqdmulhq_s16(v00, vdupq_n_s16(VG))); 194 | int16x8_t dB = vqdmulhq_s16(u00, vdupq_n_s16(UB)); 195 | #endif 196 | 197 | uint16x8x2_t xR = vzipq_u16(vreinterpretq_u16_s16(dR), vreinterpretq_u16_s16(dR)); 198 | uint16x8x2_t xG = vzipq_u16(vreinterpretq_u16_s16(dG), vreinterpretq_u16_s16(dG)); 199 | uint16x8x2_t xB = vzipq_u16(vreinterpretq_u16_s16(dB), vreinterpretq_u16_s16(dB)); 200 | 201 | uint8x16x4_t t; 202 | uint8x16x4_t b; 203 | 204 | t.val[iR] = vcombine_u8(vqmovun_s16(vaddw_u8(xR.val[0], y00)), vqmovun_s16(vaddw_u8(xR.val[1], y01))); 205 | t.val[iG] = vcombine_u8(vqmovun_s16(vaddw_u8(xG.val[0], y00)), vqmovun_s16(vaddw_u8(xG.val[1], y01))); 206 | t.val[iB] = vcombine_u8(vqmovun_s16(vaddw_u8(xB.val[0], y00)), vqmovun_s16(vaddw_u8(xB.val[1], y01))); 207 | t.val[iA] = vdupq_n_u8(255); 208 | b.val[iR] = vcombine_u8(vqmovun_s16(vaddw_u8(xR.val[0], y10)), vqmovun_s16(vaddw_u8(xR.val[1], y11))); 209 | b.val[iG] = vcombine_u8(vqmovun_s16(vaddw_u8(xG.val[0], y10)), vqmovun_s16(vaddw_u8(xG.val[1], y11))); 210 | b.val[iB] = vcombine_u8(vqmovun_s16(vaddw_u8(xB.val[0], y10)), vqmovun_s16(vaddw_u8(xB.val[1], y11))); 211 | b.val[iA] = vdupq_n_u8(255); 212 | 213 | vst4q_u8(rgb0, t); rgb0 += 16 * 4; 214 | vst4q_u8(rgb1, b); rgb1 += 16 * 4; 215 | } 216 | if (componentRGB == 4) 217 | continue; 218 | #elif HAVE_AVX512 219 | int halfWidth16 = (componentRGB == 4) ? 
halfWidth / 32 : 0; 220 | for (int w = 0; w < halfWidth16; ++w) 221 | { 222 | __m512i y00lh = _mm512_loadu_si512((__m512i*)y0); y0 += 64; 223 | __m512i y10lh = _mm512_loadu_si512((__m512i*)y1); y1 += 64; 224 | __m512i y00; 225 | __m512i y01; 226 | __m512i y10; 227 | __m512i y11; 228 | if (videoRange) 229 | { 230 | y00lh = _mm512_subs_epu8(y00lh, _mm512_set1_epi8(16)); 231 | y10lh = _mm512_subs_epu8(y10lh, _mm512_set1_epi8(16)); 232 | y00 = _mm512_mulhi_epu16(_mm512_unpacklo_epi8(__m512i(), y00lh), _mm512_set1_epi16(Y)); 233 | y01 = _mm512_mulhi_epu16(_mm512_unpackhi_epi8(__m512i(), y00lh), _mm512_set1_epi16(Y)); 234 | y10 = _mm512_mulhi_epu16(_mm512_unpacklo_epi8(__m512i(), y10lh), _mm512_set1_epi16(Y)); 235 | y11 = _mm512_mulhi_epu16(_mm512_unpackhi_epi8(__m512i(), y10lh), _mm512_set1_epi16(Y)); 236 | } 237 | else 238 | { 239 | y00 = _mm512_unpacklo_epi8(y00lh, __m512i()); 240 | y01 = _mm512_unpackhi_epi8(y00lh, __m512i()); 241 | y10 = _mm512_unpacklo_epi8(y10lh, __m512i()); 242 | y11 = _mm512_unpackhi_epi8(y10lh, __m512i()); 243 | } 244 | 245 | __m512i u00; 246 | __m512i v00; 247 | if (interleaved) 248 | { 249 | if (firstU) 250 | { 251 | __m512i uv00 = _mm512_loadu_si512((__m512i*)u0); u0 += 64; 252 | uv00 = _mm512_sub_epi8(uv00, _mm512_set1_epi8(-128)); 253 | u00 = _mm512_slli_epi16(uv00, 8); 254 | v00 = uv00; 255 | } 256 | else 257 | { 258 | __m512i uv00 = _mm512_loadu_si512((__m512i*)v0); v0 += 64; 259 | uv00 = _mm512_sub_epi8(uv00, _mm512_set1_epi8(-128)); 260 | u00 = uv00; 261 | v00 = _mm512_slli_epi16(uv00, 8); 262 | } 263 | } 264 | else 265 | { 266 | __m256i u000 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 267 | __m256i v000 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 268 | u000 = _mm256_sub_epi8(u000, _mm256_set1_epi8(-128)); 269 | v000 = _mm256_sub_epi8(v000, _mm256_set1_epi8(-128)); 270 | u00 = _mm512_slli_epi16(_mm512_cvtepi8_epi16(u000), 8); 271 | v00 = _mm512_slli_epi16(_mm512_cvtepi8_epi16(v000), 8); 272 | } 273 | 274 | __m512i dR = 
_mm512_mulhi_epi16(v00, _mm512_set1_epi16(VR)); 275 | __m512i dG = _mm512_add_epi16(_mm512_mulhi_epi16(u00, _mm512_set1_epi16(UG)), _mm512_mulhi_epi16(v00, _mm512_set1_epi16(VG))); 276 | __m512i dB = _mm512_mulhi_epi16(u00, _mm512_set1_epi16(UB)); 277 | 278 | __m512i xR[2] = { _mm512_unpacklo_epi16(dR, dR), _mm512_unpackhi_epi16(dR, dR) }; 279 | __m512i xG[2] = { _mm512_unpacklo_epi16(dG, dG), _mm512_unpackhi_epi16(dG, dG) }; 280 | __m512i xB[2] = { _mm512_unpacklo_epi16(dB, dB), _mm512_unpackhi_epi16(dB, dB) }; 281 | 282 | __m512i t[4]; 283 | __m512i b[4]; 284 | 285 | t[iR] = _mm512_packus_epi16(_mm512_add_epi16(y00, xR[0]), _mm512_add_epi16(y01, xR[1])); 286 | t[iG] = _mm512_packus_epi16(_mm512_add_epi16(y00, xG[0]), _mm512_add_epi16(y01, xG[1])); 287 | t[iB] = _mm512_packus_epi16(_mm512_add_epi16(y00, xB[0]), _mm512_add_epi16(y01, xB[1])); 288 | t[iA] = _mm512_set1_epi8(-1); 289 | b[iR] = _mm512_packus_epi16(_mm512_add_epi16(y10, xR[0]), _mm512_add_epi16(y11, xR[1])); 290 | b[iG] = _mm512_packus_epi16(_mm512_add_epi16(y10, xG[0]), _mm512_add_epi16(y11, xG[1])); 291 | b[iB] = _mm512_packus_epi16(_mm512_add_epi16(y10, xB[0]), _mm512_add_epi16(y11, xB[1])); 292 | b[iA] = _mm512_set1_epi8(-1); 293 | 294 | _MM512_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 295 | _MM512_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 296 | _MM512_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 297 | _MM512_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 298 | 299 | _mm512_storeu_si512((__m512i*)rgb0 + 0, t[0]); 300 | _mm512_storeu_si512((__m512i*)rgb0 + 1, t[1]); 301 | _mm512_storeu_si512((__m512i*)rgb0 + 2, t[2]); 302 | _mm512_storeu_si512((__m512i*)rgb0 + 3, t[3]); rgb0 += 16 * 16; 303 | _mm512_storeu_si512((__m512i*)rgb1 + 0, b[0]); 304 | _mm512_storeu_si512((__m512i*)rgb1 + 1, b[1]); 305 | _mm512_storeu_si512((__m512i*)rgb1 + 2, b[2]); 306 | _mm512_storeu_si512((__m512i*)rgb1 + 3, b[3]); rgb1 += 16 * 16; 307 | } 308 | if (componentRGB == 4) 309 | continue; 310 | #elif HAVE_AVX2 311 | int halfWidth16 = 
(componentRGB == 4) ? halfWidth / 16 : 0; 312 | for (int w = 0; w < halfWidth16; ++w) 313 | { 314 | __m256i y00lh = _mm256_loadu_si256((__m256i*)y0); y0 += 32; 315 | __m256i y10lh = _mm256_loadu_si256((__m256i*)y1); y1 += 32; 316 | __m256i y00; 317 | __m256i y01; 318 | __m256i y10; 319 | __m256i y11; 320 | if (videoRange) 321 | { 322 | y00lh = _mm256_subs_epu8(y00lh, _mm256_set1_epi8(16)); 323 | y10lh = _mm256_subs_epu8(y10lh, _mm256_set1_epi8(16)); 324 | y00 = _mm256_mulhi_epu16(_mm256_unpacklo_epi8(__m256i(), y00lh), _mm256_set1_epi16(Y)); 325 | y01 = _mm256_mulhi_epu16(_mm256_unpackhi_epi8(__m256i(), y00lh), _mm256_set1_epi16(Y)); 326 | y10 = _mm256_mulhi_epu16(_mm256_unpacklo_epi8(__m256i(), y10lh), _mm256_set1_epi16(Y)); 327 | y11 = _mm256_mulhi_epu16(_mm256_unpackhi_epi8(__m256i(), y10lh), _mm256_set1_epi16(Y)); 328 | } 329 | else 330 | { 331 | y00 = _mm256_unpacklo_epi8(y00lh, __m256i()); 332 | y01 = _mm256_unpackhi_epi8(y00lh, __m256i()); 333 | y10 = _mm256_unpacklo_epi8(y10lh, __m256i()); 334 | y11 = _mm256_unpackhi_epi8(y10lh, __m256i()); 335 | } 336 | 337 | __m256i u00; 338 | __m256i v00; 339 | if (interleaved) 340 | { 341 | if (firstU) 342 | { 343 | __m256i uv00 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 344 | uv00 = _mm256_sub_epi8(uv00, _mm256_set1_epi8(-128)); 345 | u00 = _mm256_slli_epi16(uv00, 8); 346 | v00 = uv00; 347 | } 348 | else 349 | { 350 | __m256i uv00 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 351 | uv00 = _mm256_sub_epi8(uv00, _mm256_set1_epi8(-128)); 352 | u00 = uv00; 353 | v00 = _mm256_slli_epi16(uv00, 8); 354 | } 355 | } 356 | else 357 | { 358 | __m128i u000 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 359 | __m128i v000 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 360 | u000 = _mm_sub_epi8(u000, _mm_set1_epi8(-128)); 361 | v000 = _mm_sub_epi8(v000, _mm_set1_epi8(-128)); 362 | u00 = _mm256_slli_epi16(_mm256_cvtepi8_epi16(u000), 8); 363 | v00 = _mm256_slli_epi16(_mm256_cvtepi8_epi16(v000), 8); 364 | } 365 | 366 | __m256i dR = 
_mm256_mulhi_epi16(v00, _mm256_set1_epi16(VR)); 367 | __m256i dG = _mm256_add_epi16(_mm256_mulhi_epi16(u00, _mm256_set1_epi16(UG)), _mm256_mulhi_epi16(v00, _mm256_set1_epi16(VG))); 368 | __m256i dB = _mm256_mulhi_epi16(u00, _mm256_set1_epi16(UB)); 369 | 370 | __m256i xR[2] = { _mm256_unpacklo_epi16(dR, dR), _mm256_unpackhi_epi16(dR, dR) }; 371 | __m256i xG[2] = { _mm256_unpacklo_epi16(dG, dG), _mm256_unpackhi_epi16(dG, dG) }; 372 | __m256i xB[2] = { _mm256_unpacklo_epi16(dB, dB), _mm256_unpackhi_epi16(dB, dB) }; 373 | 374 | __m256i t[4]; 375 | __m256i b[4]; 376 | 377 | t[iR] = _mm256_packus_epi16(_mm256_add_epi16(y00, xR[0]), _mm256_add_epi16(y01, xR[1])); 378 | t[iG] = _mm256_packus_epi16(_mm256_add_epi16(y00, xG[0]), _mm256_add_epi16(y01, xG[1])); 379 | t[iB] = _mm256_packus_epi16(_mm256_add_epi16(y00, xB[0]), _mm256_add_epi16(y01, xB[1])); 380 | t[iA] = _mm256_set1_epi8(-1); 381 | b[iR] = _mm256_packus_epi16(_mm256_add_epi16(y10, xR[0]), _mm256_add_epi16(y11, xR[1])); 382 | b[iG] = _mm256_packus_epi16(_mm256_add_epi16(y10, xG[0]), _mm256_add_epi16(y11, xG[1])); 383 | b[iB] = _mm256_packus_epi16(_mm256_add_epi16(y10, xB[0]), _mm256_add_epi16(y11, xB[1])); 384 | b[iA] = _mm256_set1_epi8(-1); 385 | 386 | _MM256_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 387 | _MM256_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 388 | _MM256_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 389 | _MM256_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 390 | 391 | _mm256_storeu_si256((__m256i*)rgb0 + 0, t[0]); 392 | _mm256_storeu_si256((__m256i*)rgb0 + 1, t[1]); 393 | _mm256_storeu_si256((__m256i*)rgb0 + 2, t[2]); 394 | _mm256_storeu_si256((__m256i*)rgb0 + 3, t[3]); rgb0 += 16 * 8; 395 | _mm256_storeu_si256((__m256i*)rgb1 + 0, b[0]); 396 | _mm256_storeu_si256((__m256i*)rgb1 + 1, b[1]); 397 | _mm256_storeu_si256((__m256i*)rgb1 + 2, b[2]); 398 | _mm256_storeu_si256((__m256i*)rgb1 + 3, b[3]); rgb1 += 16 * 8; 399 | } 400 | if (componentRGB == 4) 401 | continue; 402 | #elif HAVE_SSE2 403 | int halfWidth8 = 
(componentRGB == 4) ? halfWidth / 8 : 0; 404 | for (int w = 0; w < halfWidth8; ++w) 405 | { 406 | __m128i y00lh = _mm_loadu_si128((__m128i*)y0); y0 += 16; 407 | __m128i y10lh = _mm_loadu_si128((__m128i*)y1); y1 += 16; 408 | __m128i y00; 409 | __m128i y01; 410 | __m128i y10; 411 | __m128i y11; 412 | if (videoRange) 413 | { 414 | y00lh = _mm_subs_epu8(y00lh, _mm_set1_epi8(16)); 415 | y10lh = _mm_subs_epu8(y10lh, _mm_set1_epi8(16)); 416 | y00 = _mm_mulhi_epu16(_mm_unpacklo_epi8(__m128i(), y00lh), _mm_set1_epi16(Y)); 417 | y01 = _mm_mulhi_epu16(_mm_unpackhi_epi8(__m128i(), y00lh), _mm_set1_epi16(Y)); 418 | y10 = _mm_mulhi_epu16(_mm_unpacklo_epi8(__m128i(), y10lh), _mm_set1_epi16(Y)); 419 | y11 = _mm_mulhi_epu16(_mm_unpackhi_epi8(__m128i(), y10lh), _mm_set1_epi16(Y)); 420 | } 421 | else 422 | { 423 | y00 = _mm_unpacklo_epi8(y00lh, __m128i()); 424 | y01 = _mm_unpackhi_epi8(y00lh, __m128i()); 425 | y10 = _mm_unpacklo_epi8(y10lh, __m128i()); 426 | y11 = _mm_unpackhi_epi8(y10lh, __m128i()); 427 | } 428 | 429 | __m128i u00; 430 | __m128i v00; 431 | if (interleaved) 432 | { 433 | if (firstU) 434 | { 435 | __m128i uv00 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 436 | uv00 = _mm_sub_epi8(uv00, _mm_set1_epi8(-128)); 437 | u00 = _mm_slli_epi16(uv00, 8); 438 | v00 = uv00; 439 | } 440 | else 441 | { 442 | __m128i uv00 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 443 | uv00 = _mm_sub_epi8(uv00, _mm_set1_epi8(-128)); 444 | u00 = uv00; 445 | v00 = _mm_slli_epi16(uv00, 8); 446 | } 447 | } 448 | else 449 | { 450 | __m128i u000 = _mm_loadl_epi64((__m128i*)u0); u0 += 8; 451 | __m128i v000 = _mm_loadl_epi64((__m128i*)v0); v0 += 8; 452 | u000 = _mm_sub_epi8(u000, _mm_set1_epi8(-128)); 453 | v000 = _mm_sub_epi8(v000, _mm_set1_epi8(-128)); 454 | u00 = _mm_unpacklo_epi8(__m128i(), u000); 455 | v00 = _mm_unpacklo_epi8(__m128i(), v000); 456 | } 457 | 458 | __m128i dR = _mm_mulhi_epi16(v00, _mm_set1_epi16(VR)); 459 | __m128i dG = _mm_add_epi16(_mm_mulhi_epi16(u00, _mm_set1_epi16(UG)), 
_mm_mulhi_epi16(v00, _mm_set1_epi16(VG))); 460 | __m128i dB = _mm_mulhi_epi16(u00, _mm_set1_epi16(UB)); 461 | 462 | __m128i xR[2] = { _mm_unpacklo_epi16(dR, dR), _mm_unpackhi_epi16(dR, dR) }; 463 | __m128i xG[2] = { _mm_unpacklo_epi16(dG, dG), _mm_unpackhi_epi16(dG, dG) }; 464 | __m128i xB[2] = { _mm_unpacklo_epi16(dB, dB), _mm_unpackhi_epi16(dB, dB) }; 465 | 466 | __m128i t[4]; 467 | __m128i b[4]; 468 | 469 | t[iR] = _mm_packus_epi16(_mm_add_epi16(y00, xR[0]), _mm_add_epi16(y01, xR[1])); 470 | t[iG] = _mm_packus_epi16(_mm_add_epi16(y00, xG[0]), _mm_add_epi16(y01, xG[1])); 471 | t[iB] = _mm_packus_epi16(_mm_add_epi16(y00, xB[0]), _mm_add_epi16(y01, xB[1])); 472 | t[iA] = _mm_set1_epi8(-1); 473 | b[iR] = _mm_packus_epi16(_mm_add_epi16(y10, xR[0]), _mm_add_epi16(y11, xR[1])); 474 | b[iG] = _mm_packus_epi16(_mm_add_epi16(y10, xG[0]), _mm_add_epi16(y11, xG[1])); 475 | b[iB] = _mm_packus_epi16(_mm_add_epi16(y10, xB[0]), _mm_add_epi16(y11, xB[1])); 476 | b[iA] = _mm_set1_epi8(-1); 477 | 478 | _MM_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 479 | _MM_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 480 | 481 | _mm_storeu_si128((__m128i*)rgb0 + 0, t[0]); 482 | _mm_storeu_si128((__m128i*)rgb0 + 1, t[1]); 483 | _mm_storeu_si128((__m128i*)rgb0 + 2, t[2]); 484 | _mm_storeu_si128((__m128i*)rgb0 + 3, t[3]); rgb0 += 16 * 4; 485 | _mm_storeu_si128((__m128i*)rgb1 + 0, b[0]); 486 | _mm_storeu_si128((__m128i*)rgb1 + 1, b[1]); 487 | _mm_storeu_si128((__m128i*)rgb1 + 2, b[2]); 488 | _mm_storeu_si128((__m128i*)rgb1 + 3, b[3]); rgb1 += 16 * 4; 489 | } 490 | if (componentRGB == 4) 491 | continue; 492 | #endif 493 | for (int w = 0; w < halfWidth; ++w) 494 | { 495 | int y00 = (*y0++); 496 | int y01 = (*y0++); 497 | int y10 = (*y1++); 498 | int y11 = (*y1++); 499 | if (videoRange) 500 | { 501 | y00 = ((y00 - 16) * Y) >> 8; 502 | y01 = ((y01 - 16) * Y) >> 8; 503 | y10 = ((y10 - 16) * Y) >> 8; 504 | y11 = ((y11 - 16) * Y) >> 8; 505 | } 506 | 507 | int u00 = (*u0++) - 128; 508 | int v00 = (*v0++) - 128; 
509 | if (interleaved) 510 | { 511 | u0++; 512 | v0++; 513 | } 514 | 515 | int dR = ( v00 * VR) >> 8; 516 | int dG = (u00 * UG + v00 * VG) >> 8; 517 | int dB = (u00 * UB ) >> 8; 518 | 519 | auto clamp = [](int value) -> unsigned char 520 | { 521 | return (unsigned char)(value < 255 ? value < 0 ? 0 : value : 255); 522 | }; 523 | 524 | if (componentRGB >= 1) rgb0[iR] = clamp(y00 + dR); 525 | if (componentRGB >= 2) rgb0[iG] = clamp(y00 + dG); 526 | if (componentRGB >= 3) rgb0[iB] = clamp(y00 + dB); 527 | if (componentRGB >= 4) rgb0[iA] = 255; 528 | rgb0 += componentRGB; 529 | 530 | if (componentRGB >= 1) rgb0[iR] = clamp(y01 + dR); 531 | if (componentRGB >= 2) rgb0[iG] = clamp(y01 + dG); 532 | if (componentRGB >= 3) rgb0[iB] = clamp(y01 + dB); 533 | if (componentRGB >= 4) rgb0[iA] = 255; 534 | rgb0 += componentRGB; 535 | 536 | if (componentRGB >= 1) rgb1[iR] = clamp(y10 + dR); 537 | if (componentRGB >= 2) rgb1[iG] = clamp(y10 + dG); 538 | if (componentRGB >= 3) rgb1[iB] = clamp(y10 + dB); 539 | if (componentRGB >= 4) rgb1[iA] = 255; 540 | rgb1 += componentRGB; 541 | 542 | if (componentRGB >= 1) rgb1[iR] = clamp(y11 + dR); 543 | if (componentRGB >= 2) rgb1[iG] = clamp(y11 + dG); 544 | if (componentRGB >= 3) rgb1[iB] = clamp(y11 + dB); 545 | if (componentRGB >= 4) rgb1[iA] = 255; 546 | rgb1 += componentRGB; 547 | } 548 | } 549 | } 550 | //------------------------------------------------------------------------------ 551 | #ifndef yuv2rgb_select 552 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 553 | yuv2rgb 554 | #endif 555 | //------------------------------------------------------------------------------ 556 | #ifndef yuv2rgb 557 | //------------------------------------------------------------------------------ 558 | #if defined(__llvm__) 559 | #define rgb2yuv_attribute(value) __attribute__((target(value))) 560 | #else 561 | #define rgb2yuv_attribute(value) 562 | #endif 563 | 
//------------------------------------------------------------------------------ 564 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 565 | #define HAVE_NEON 1 566 | #define yuv2rgb rgb2yuv_attribute("neon") yuv2rgb_neon 567 | #include "yuv2rgb.inl" 568 | #undef yuv2rgb 569 | #undef HAVE_NEON 570 | #undef yuv2rgb_select 571 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 572 | neon() ? yuv2rgb_neon : \ 573 | yuv2rgb 574 | #endif 575 | //------------------------------------------------------------------------------ 576 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 577 | #define HAVE_SSE2 1 578 | #define yuv2rgb rgb2yuv_attribute("sse2") yuv2rgb_sse2 579 | #include "yuv2rgb.inl" 580 | #undef yuv2rgb 581 | #undef HAVE_SSE2 582 | #undef yuv2rgb_select 583 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 584 | sse2() ? yuv2rgb_sse2 : \ 585 | yuv2rgb 586 | #endif 587 | //------------------------------------------------------------------------------ 588 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 589 | #define HAVE_AVX2 1 590 | #define yuv2rgb rgb2yuv_attribute("avx2") yuv2rgb_avx2 591 | #include "yuv2rgb.inl" 592 | #undef yuv2rgb 593 | #undef HAVE_AVX2 594 | #undef yuv2rgb_select 595 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 596 | avx2() ? yuv2rgb_avx2 : \ 597 | sse2() ? 
yuv2rgb_sse2 : \ 598 | yuv2rgb 599 | #endif 600 | //------------------------------------------------------------------------------ 601 | #endif 602 | //------------------------------------------------------------------------------ 603 | -------------------------------------------------------------------------------- /yuv2rgb_amx.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb_amx Source 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "apple_amx.h" 11 | #include "yuv2rgb.h" 12 | 13 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 14 | 15 | // BT.709 - Video Range 16 | // Y U V 17 | // R = 1.164384 0.000000 1.792741 18 | // G = 1.164384 -0.213249 -0.532909 19 | // B = 1.164384 2.112402 0.000000 20 | // 21 | // BT.709 - Full Range 22 | // Y U V 23 | // R = 1.000000 0.000000 1.581000 24 | // G = 1.000000 -0.188062 -0.469967 25 | // B = 1.000000 1.862906 0.000000 26 | #define vY 1.164384 27 | #define vUG -0.213249 28 | #define vUB 2.112402 29 | #define vVR 1.792741 30 | #define vVG -0.532909 31 | #define fY 1.000000 32 | #define fUG -0.188062 33 | #define fUB 1.862906 34 | #define fVR 1.581000 35 | #define fVG -0.469967 36 | 37 | //------------------------------------------------------------------------------ 38 | template 39 | void yuv2rgb_amx(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB) 40 | { 41 | #if defined(__APPLE__) && defined(__aarch64__) 42 | if (strideRGB < 0) 43 | { 44 | rgb = (char*)rgb - (strideRGB * (height - 1)); 45 | } 46 | 47 | int halfWidth = width >> 1; 48 | int halfHeight = height >> 1; 49 | 
50 | int iR = rgbSwizzle ? 2 : 0; 51 | int iG = 1; 52 | int iB = rgbSwizzle ? 0 : 2; 53 | int iA = 3; 54 | 55 | static constexpr int16_t Y = fullRange ? (int)(fY * 256) : (int)(vY * 256); 56 | static constexpr int16_t UG = fullRange ? (int)(fUG * 255) : (int)(vUG * 255); 57 | static constexpr int16_t UB = fullRange ? (int)(fUB * 255) : (int)(vUB * 255); 58 | static constexpr int16_t VR = fullRange ? (int)(fVR * 255) : (int)(vVR * 255); 59 | static constexpr int16_t VG = fullRange ? (int)(fVG * 255) : (int)(vVG * 255); 60 | 61 | static constexpr int16_t vector256[32] = { [0 ... 31] = 256 }; 62 | static constexpr int16_t vectorN128[32] = { [0 ... 31] = -128 }; 63 | static constexpr int16_t vectorY[32] = { [0 ... 31] = (int16_t)(Y >> 1) }; 64 | static constexpr int16_t vectorVR[32] = { [0 ... 31] = (int16_t)(VR >> 2) }; 65 | static constexpr int16_t vectorUG[32] = { [0 ... 31] = (int16_t)(UG >> 1) }; 66 | static constexpr int16_t vectorVG[32] = { [0 ... 31] = (int16_t)(VG >> 1) }; 67 | static constexpr int16_t vectorUB[32] = { [0 ... 
31] = (int16_t)(UB >> 3) }; 68 | 69 | amx_set(); 70 | amx_ldy( /*.memory_offset = */(uint64_t)vector256, .register_index = 1 ); 71 | amx_ldy( /*.memory_offset = */(uint64_t)vectorN128, .register_index = 2 ); 72 | amx_ldy( /*.memory_offset = */(uint64_t)vectorY, .register_index = 3 ); 73 | amx_ldy( /*.memory_offset = */(uint64_t)vectorVR, .register_index = 4 ); 74 | amx_ldy( /*.memory_offset = */(uint64_t)vectorUG, .register_index = 5 ); 75 | amx_ldy( /*.memory_offset = */(uint64_t)vectorVG, .register_index = 6 ); 76 | amx_ldy( /*.memory_offset = */(uint64_t)vectorUB, .register_index = 7 ); 77 | for (int h = 0; h < halfHeight; ++h) 78 | { 79 | const unsigned char* y0 = (unsigned char*)y; 80 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 81 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 82 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 83 | unsigned char* rgb0 = (unsigned char*)rgb; 84 | unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 85 | int halfWidth128 = width / 128; 86 | for (int w = 0; w < halfWidth128; ++w) 87 | { 88 | // Clear 89 | amx_mac16( .skip_x = 1, .skip_y = 1, .skip_z = 1, .mode_32 = 1 ); 90 | 91 | // Load 92 | amx_ldx( /*.memory_offset = */(uint64_t)y0 + 0, .register_index = 0 ); 93 | amx_ldx( /*.memory_offset = */(uint64_t)y0 + 64, .register_index = 1 ); y0 += 128; 94 | amx_ldx( /*.memory_offset = */(uint64_t)y1 + 0, .register_index = 2 ); 95 | amx_ldx( /*.memory_offset = */(uint64_t)y1 + 64, .register_index = 3 ); y1 += 128; 96 | amx_ldx( /*.memory_offset = */(uint64_t)u0 + 0, .register_index = 4 ); u0 += 64; 97 | amx_ldx( /*.memory_offset = */(uint64_t)v0 + 0, .register_index = 5 ); v0 += 64; 98 | 99 | // Y 100 | amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 32, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 101 | amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 40, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 102 | 
amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 48, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 103 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 34, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 104 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 42, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 105 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 50, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 106 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 36, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 107 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 44, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 108 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 52, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 109 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 38, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 110 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 46, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 111 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 54, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 112 | 113 | // UV 114 | amx_vecint( .offset_x = 0x100, .offset_y = 0x080, .offset_z = 56, .count_x = 2, .extended = 12, .add = 1 ); 115 | amx_vecint( .offset_x = 0x140, .offset_y = 0x080, .offset_z = 60, .count_x = 2, .extended = 12, .add = 1 ); 116 | amx_extrx( .offset_x = 0x000, .offset_z = 56 ); 117 | amx_extrx( .offset_x = 0x040, .offset_z = 57 ); 118 | amx_extrx( .offset_x = 0x080, .offset_z = 58 ); 119 | amx_extrx( .offset_x = 0x0C0, .offset_z = 59 ); 120 | amx_extrx( .offset_x = 0x100, .offset_z = 60 ); 121 | amx_extrx( .offset_x = 0x140, .offset_z = 61 ); 122 | amx_extrx( .offset_x = 0x180, .offset_z = 62 ); 123 | amx_extrx( .offset_x = 0x1C0, 
.offset_z = 63 ); 124 | amx_vecint( .offset_x = 0x200 - 2, .offset_y = 0, .offset_z = 56, .add = 1 ); 125 | amx_vecint( .offset_x = 0x040 - 2, .offset_y = 0, .offset_z = 57, .add = 1 ); 126 | amx_vecint( .offset_x = 0x080 - 2, .offset_y = 0, .offset_z = 58, .add = 1 ); 127 | amx_vecint( .offset_x = 0x0C0 - 2, .offset_y = 0, .offset_z = 59, .add = 1 ); 128 | amx_vecint( .offset_x = 0x100 - 2, .offset_y = 0, .offset_z = 60, .add = 1 ); 129 | amx_vecint( .offset_x = 0x140 - 2, .offset_y = 0, .offset_z = 61, .add = 1 ); 130 | amx_vecint( .offset_x = 0x180 - 2, .offset_y = 0, .offset_z = 62, .add = 1 ); 131 | amx_vecint( .offset_x = 0x1C0 - 2, .offset_y = 0, .offset_z = 63, .add = 1 ); 132 | amx_extrx( .offset_x = 0x000, .offset_z = 56 ); 133 | amx_extrx( .offset_x = 0x040, .offset_z = 57 ); 134 | amx_extrx( .offset_x = 0x080, .offset_z = 58 ); 135 | amx_extrx( .offset_x = 0x0C0, .offset_z = 59 ); 136 | amx_extrx( .offset_x = 0x100, .offset_z = 60 ); 137 | amx_extrx( .offset_x = 0x140, .offset_z = 61 ); 138 | amx_extrx( .offset_x = 0x180, .offset_z = 62 ); 139 | amx_extrx( .offset_x = 0x1C0, .offset_z = 63 ); 140 | amx_vecint( .offset_x = 0x000, .offset_y = 0x140, .offset_z = 40, .shift_right = 7 ); 141 | amx_vecint( .offset_x = 0x000, .offset_y = 0x140, .offset_z = 44, .shift_right = 7 ); 142 | amx_vecint( .offset_x = 0x040, .offset_y = 0x140, .offset_z = 41, .shift_right = 7 ); 143 | amx_vecint( .offset_x = 0x040, .offset_y = 0x140, .offset_z = 45, .shift_right = 7 ); 144 | amx_vecint( .offset_x = 0x080, .offset_y = 0x140, .offset_z = 42, .shift_right = 7 ); 145 | amx_vecint( .offset_x = 0x080, .offset_y = 0x140, .offset_z = 46, .shift_right = 7 ); 146 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x140, .offset_z = 43, .shift_right = 7 ); 147 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x140, .offset_z = 47, .shift_right = 7 ); 148 | amx_vecint( .offset_x = 0x000, .offset_y = 0x1C0, .offset_z = 48, .shift_right = 5 ); 149 | amx_vecint( .offset_x = 0x000, .offset_y = 
0x1C0, .offset_z = 52, .shift_right = 5 ); 150 | amx_vecint( .offset_x = 0x040, .offset_y = 0x1C0, .offset_z = 49, .shift_right = 5 ); 151 | amx_vecint( .offset_x = 0x040, .offset_y = 0x1C0, .offset_z = 53, .shift_right = 5 ); 152 | amx_vecint( .offset_x = 0x080, .offset_y = 0x1C0, .offset_z = 50, .shift_right = 5 ); 153 | amx_vecint( .offset_x = 0x080, .offset_y = 0x1C0, .offset_z = 54, .shift_right = 5 ); 154 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x1C0, .offset_z = 51, .shift_right = 5 ); 155 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x1C0, .offset_z = 55, .shift_right = 5 ); 156 | amx_vecint( .offset_x = 0x100, .offset_y = 0x100, .offset_z = 32, .shift_right = 6 ); 157 | amx_vecint( .offset_x = 0x100, .offset_y = 0x100, .offset_z = 36, .shift_right = 6 ); 158 | amx_vecint( .offset_x = 0x140, .offset_y = 0x100, .offset_z = 33, .shift_right = 6 ); 159 | amx_vecint( .offset_x = 0x140, .offset_y = 0x100, .offset_z = 37, .shift_right = 6 ); 160 | amx_vecint( .offset_x = 0x180, .offset_y = 0x100, .offset_z = 34, .shift_right = 6 ); 161 | amx_vecint( .offset_x = 0x180, .offset_y = 0x100, .offset_z = 38, .shift_right = 6 ); 162 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x100, .offset_z = 35, .shift_right = 6 ); 163 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x100, .offset_z = 39, .shift_right = 6 ); 164 | amx_vecint( .offset_x = 0x100, .offset_y = 0x180, .offset_z = 40, .shift_right = 7 ); 165 | amx_vecint( .offset_x = 0x100, .offset_y = 0x180, .offset_z = 44, .shift_right = 7 ); 166 | amx_vecint( .offset_x = 0x140, .offset_y = 0x180, .offset_z = 41, .shift_right = 7 ); 167 | amx_vecint( .offset_x = 0x140, .offset_y = 0x180, .offset_z = 45, .shift_right = 7 ); 168 | amx_vecint( .offset_x = 0x180, .offset_y = 0x180, .offset_z = 42, .shift_right = 7 ); 169 | amx_vecint( .offset_x = 0x180, .offset_y = 0x180, .offset_z = 46, .shift_right = 7 ); 170 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x180, .offset_z = 43, .shift_right = 7 ); 171 | amx_vecint( 
.offset_x = 0x1C0, .offset_y = 0x180, .offset_z = 47, .shift_right = 7 ); 172 | 173 | // RGBA 174 | if (iR == 0) 175 | { 176 | amx_extrx( .offset_x = 0x000, .offset_z = 32 ); 177 | amx_extrx( .offset_x = 0x040, .offset_z = 33 ); 178 | amx_extrx( .offset_x = 0x080, .offset_z = 34 ); 179 | amx_extrx( .offset_x = 0x0C0, .offset_z = 35 ); 180 | amx_extrx( .offset_x = 0x100, .offset_z = 36 ); 181 | amx_extrx( .offset_x = 0x140, .offset_z = 37 ); 182 | amx_extrx( .offset_x = 0x180, .offset_z = 38 ); 183 | amx_extrx( .offset_x = 0x1C0, .offset_z = 39 ); 184 | } 185 | else 186 | { 187 | amx_extrx( .offset_x = 0x000, .offset_z = 48 ); 188 | amx_extrx( .offset_x = 0x040, .offset_z = 49 ); 189 | amx_extrx( .offset_x = 0x080, .offset_z = 50 ); 190 | amx_extrx( .offset_x = 0x0C0, .offset_z = 51 ); 191 | amx_extrx( .offset_x = 0x100, .offset_z = 52 ); 192 | amx_extrx( .offset_x = 0x140, .offset_z = 53 ); 193 | amx_extrx( .offset_x = 0x180, .offset_z = 54 ); 194 | amx_extrx( .offset_x = 0x1C0, .offset_z = 55 ); 195 | } 196 | amx_vecint( .offset_x = 0x000, .offset_y = 0x000, .offset_z = 0, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 197 | amx_vecint( .offset_x = 0x040, .offset_y = 0x000, .offset_z = 4, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 198 | amx_vecint( .offset_x = 0x080, .offset_y = 0x000, .offset_z = 8, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 199 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x000, .offset_z = 12, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 200 | amx_vecint( .offset_x = 0x100, .offset_y = 0x000, .offset_z = 16, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 201 | amx_vecint( .offset_x = 0x140, .offset_y = 0x000, .offset_z = 20, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 202 | amx_vecint( .offset_x = 0x180, .offset_y = 0x000, .offset_z = 24, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 203 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x000, .offset_z = 28, .count_x = 1, 
.count_y = 1, .extended = 10, .add = 1 ); 204 | amx_extrx( .offset_x = 0x000, .offset_z = 40 ); 205 | amx_extrx( .offset_x = 0x040, .offset_z = 41 ); 206 | amx_extrx( .offset_x = 0x080, .offset_z = 42 ); 207 | amx_extrx( .offset_x = 0x0C0, .offset_z = 43 ); 208 | amx_extrx( .offset_x = 0x100, .offset_z = 44 ); 209 | amx_extrx( .offset_x = 0x140, .offset_z = 45 ); 210 | amx_extrx( .offset_x = 0x180, .offset_z = 46 ); 211 | amx_extrx( .offset_x = 0x1C0, .offset_z = 47 ); 212 | amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 0, .count_x = 1, .extended = 12 ); 213 | amx_vecint( .offset_x = 0x040, .offset_y = 0x040, .offset_z = 4, .count_x = 1, .extended = 12 ); 214 | amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 8, .count_x = 1, .extended = 12 ); 215 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x040, .offset_z = 12, .count_x = 1, .extended = 12 ); 216 | amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 16, .count_x = 1, .extended = 12 ); 217 | amx_vecint( .offset_x = 0x140, .offset_y = 0x040, .offset_z = 20, .count_x = 1, .extended = 12 ); 218 | amx_vecint( .offset_x = 0x180, .offset_y = 0x040, .offset_z = 24, .count_x = 1, .extended = 12 ); 219 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x040, .offset_z = 28, .count_x = 1, .extended = 12 ); 220 | if (iB == 2) 221 | { 222 | amx_extrx( .offset_x = 0x000, .offset_z = 48 ); 223 | amx_extrx( .offset_x = 0x040, .offset_z = 49 ); 224 | amx_extrx( .offset_x = 0x080, .offset_z = 50 ); 225 | amx_extrx( .offset_x = 0x0C0, .offset_z = 51 ); 226 | amx_extrx( .offset_x = 0x100, .offset_z = 52 ); 227 | amx_extrx( .offset_x = 0x140, .offset_z = 53 ); 228 | amx_extrx( .offset_x = 0x180, .offset_z = 54 ); 229 | amx_extrx( .offset_x = 0x1C0, .offset_z = 55 ); 230 | } 231 | else 232 | { 233 | amx_extrx( .offset_x = 0x000, .offset_z = 32 ); 234 | amx_extrx( .offset_x = 0x040, .offset_z = 33 ); 235 | amx_extrx( .offset_x = 0x080, .offset_z = 34 ); 236 | amx_extrx( .offset_x = 0x0C0, .offset_z = 
35 ); 237 | amx_extrx( .offset_x = 0x100, .offset_z = 36 ); 238 | amx_extrx( .offset_x = 0x140, .offset_z = 37 ); 239 | amx_extrx( .offset_x = 0x180, .offset_z = 38 ); 240 | amx_extrx( .offset_x = 0x1C0, .offset_z = 39 ); 241 | } 242 | amx_vecint( .offset_x = 0x1E0, .offset_y = 0x040, .offset_z = 0, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 243 | amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 1, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 244 | amx_vecint( .offset_x = 0x020, .offset_y = 0x040, .offset_z = 4, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 245 | amx_vecint( .offset_x = 0x040, .offset_y = 0x040, .offset_z = 5, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 246 | amx_vecint( .offset_x = 0x060, .offset_y = 0x040, .offset_z = 8, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 247 | amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 9, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 248 | amx_vecint( .offset_x = 0x0A0, .offset_y = 0x040, .offset_z = 12, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 249 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x040, .offset_z = 13, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 250 | amx_vecint( .offset_x = 0x0E0, .offset_y = 0x040, .offset_z = 16, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 251 | amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 17, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 252 | amx_vecint( .offset_x = 0x120, .offset_y = 0x040, .offset_z = 20, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 253 | amx_vecint( .offset_x = 0x140, .offset_y = 0x040, .offset_z = 21, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 254 | amx_vecint( .offset_x = 0x160, .offset_y = 0x040, .offset_z = 24, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 255 | amx_vecint( .offset_x = 0x180, .offset_y = 0x040, 
.offset_z = 25, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 256 | amx_vecint( .offset_x = 0x1A0, .offset_y = 0x040, .offset_z = 28, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 257 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x040, .offset_z = 29, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 258 | 259 | // Store 260 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 0, .register_index = 0 ); 261 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 64, .register_index = 1 ); 262 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 128, .register_index = 4 ); 263 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 192, .register_index = 5 ); 264 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 256, .register_index = 8 ); 265 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 320, .register_index = 9 ); 266 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 384, .register_index = 12 ); 267 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 448, .register_index = 13 ); rgb0 += 128 * 4; 268 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 0, .register_index = 16 ); 269 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 64, .register_index = 17 ); 270 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 128, .register_index = 20 ); 271 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 192, .register_index = 21 ); 272 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 256, .register_index = 24 ); 273 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 320, .register_index = 25 ); 274 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 384, .register_index = 28 ); 275 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 448, .register_index = 29 ); rgb1 += 128 * 4; 276 | } 277 | } 278 | amx_clr(); 279 | #endif 280 | } 281 | //------------------------------------------------------------------------------ 282 | void yyy2rgb_amx(int width, int height, const void* y, int strideY, void* rgb, int strideRGB) 283 | { 284 | #if defined(__APPLE__) && defined(__aarch64__) 
    // -- body of yyy2rgb_amx (signature above): replicate the luma plane into
    // packed 4-byte grayscale pixels using AMX, two rows of 128 pixels per
    // inner iteration.
    int halfWidth = width >> 1;
    int halfHeight = height >> 1;

    // 257 == 0x0101: NOTE(review): appears to spread each luma byte into both
    // bytes of a 16-bit lane (y * 257) -- confirm against apple_amx.h.
    static constexpr int16_t vector257[32] = { [0 ... 31] = 257 };

    amx_set();  // enable AMX state for this thread
    amx_ldy( .memory_offset = (uint64_t)vector257, .register_index = 1 );
    for (int h = 0; h < halfHeight; ++h)
    {
        // Two source rows and two destination rows per iteration.
        const unsigned char* y0 = (unsigned char*)y;
        const unsigned char* y1 = y0 + strideY; y = y1 + strideY;
        unsigned char* rgb0 = (unsigned char*)rgb;
        unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB;
        int halfWidth128 = width / 128;  // 128 pixels per AMX pass
        for (int w = 0; w < halfWidth128; ++w)
        {
            // Clear the Z accumulators.
            amx_mac16( .skip_x = 1, .skip_y = 1, .skip_z = 1, .mode_32 = 1 );

            // Load 128 luma bytes from each of the two rows.
            amx_ldx( .memory_offset = (uint64_t)y0 + 0, .register_index = 0 );
            amx_ldx( .memory_offset = (uint64_t)y0 + 64, .register_index = 2 ); y0 += 128;
            amx_ldx( .memory_offset = (uint64_t)y1 + 0, .register_index = 4 );
            amx_ldx( .memory_offset = (uint64_t)y1 + 64, .register_index = 6 ); y1 += 128;

            // Y : widen and replicate the luma into the Z registers.
            // NOTE(review): the exact .extended/.neg/.add field semantics
            // follow apple_amx.h; the sequence is order-dependent and
            // intentionally left untouched.
            amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 0, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 8, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 16, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x180, .offset_y = 0x040, .offset_z = 24, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x000, .offset_y = 0x000, .offset_z = 2, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x080, .offset_y = 0x000, .offset_z = 10, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x100, .offset_y = 0x000, .offset_z = 18, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x180, .offset_y = 0x000, .offset_z = 26, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x1E0, .offset_y = 0x000, .offset_z = 0, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x060, .offset_y = 0x000, .offset_z = 8, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x0E0, .offset_y = 0x000, .offset_z = 16, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x160, .offset_y = 0x000, .offset_z = 24, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x1E0, .offset_y = 0x000, .offset_z = 2, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x060, .offset_y = 0x000, .offset_z = 10, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x0E0, .offset_y = 0x000, .offset_z = 18, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x160, .offset_y = 0x000, .offset_z = 26, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );

            // Store : 8 x 64 bytes = 512 bytes (128 four-byte pixels) per row.
            amx_stz( .memory_offset = (uint64_t)rgb0 + 0, .register_index = 0 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 64, .register_index = 1 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 128, .register_index = 2 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 192, .register_index = 3 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 256, .register_index = 8 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 320, .register_index = 9 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 384, .register_index = 10 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 448, .register_index = 11 ); rgb0 += 128 * 4;
            amx_stz( .memory_offset = (uint64_t)rgb1 + 0, .register_index = 16 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 64, .register_index = 17 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 128, .register_index = 18 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 192, .register_index = 19 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 256, .register_index = 24 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 320, .register_index = 25 );
| amx_stz( .memory_offset = (uint64_t)rgb1 + 384, .register_index = 26 ); 344 | amx_stz( .memory_offset = (uint64_t)rgb1 + 448, .register_index = 27 ); rgb1 += 128 * 4; 345 | } 346 | } 347 | amx_clr(); 348 | #endif 349 | } 350 | //------------------------------------------------------------------------------ 351 | void yuv2rgb_yu12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 352 | { 353 | int strideY = align(width, alignWidth); 354 | int strideU = align(width, alignWidth) / 2; 355 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 356 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 357 | 358 | if (strideRGB == 0) 359 | strideRGB = rgbWidth * width; 360 | 361 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 362 | 363 | if (rgbWidth == 3) 364 | { 365 | if (rgbSwizzle) 366 | { 367 | if (fullRange) 368 | converter = yuv2rgb_amx<3, true, false, false, true>; 369 | else 370 | converter = yuv2rgb_amx<3, true, false, false, false>; 371 | } 372 | else 373 | { 374 | if (fullRange) 375 | converter = yuv2rgb_amx<3, false, false, false, true>; 376 | else 377 | converter = yuv2rgb_amx<3, false, false, false, false>; 378 | } 379 | } 380 | else if (rgbWidth == 4) 381 | { 382 | if (rgbSwizzle) 383 | { 384 | if (fullRange) 385 | converter = yuv2rgb_amx<4, true, false, false, true>; 386 | else 387 | converter = yuv2rgb_amx<4, true, false, false, false>; 388 | } 389 | else 390 | { 391 | if (fullRange) 392 | converter = yuv2rgb_amx<4, false, false, false, true>; 393 | else 394 | converter = yuv2rgb_amx<4, false, false, false, false>; 395 | } 396 | } 397 | 398 | converter(width, height, yuv, (char*)yuv + sizeY, (char*)yuv + sizeY + sizeU, strideY, strideU, strideU, rgb, strideRGB); 399 | } 400 | //------------------------------------------------------------------------------ 401 | void yuv2rgb_yv12_amx(int 
width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 402 | { 403 | int strideY = align(width, alignWidth); 404 | int strideU = align(width, alignWidth) / 2; 405 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 406 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 407 | 408 | if (strideRGB == 0) 409 | strideRGB = rgbWidth * width; 410 | 411 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 412 | 413 | if (rgbWidth == 3) 414 | { 415 | if (rgbSwizzle) 416 | { 417 | if (fullRange) 418 | converter = yuv2rgb_amx<3, true, false, false, true>; 419 | else 420 | converter = yuv2rgb_amx<3, true, false, false, false>; 421 | } 422 | else 423 | { 424 | if (fullRange) 425 | converter = yuv2rgb_amx<3, false, false, false, true>; 426 | else 427 | converter = yuv2rgb_amx<3, false, false, false, false>; 428 | } 429 | } 430 | else if (rgbWidth == 4) 431 | { 432 | if (rgbSwizzle) 433 | { 434 | if (fullRange) 435 | converter = yuv2rgb_amx<4, true, false, false, true>; 436 | else 437 | converter = yuv2rgb_amx<4, true, false, false, false>; 438 | } 439 | else 440 | { 441 | if (fullRange) 442 | converter = yuv2rgb_amx<4, false, false, false, true>; 443 | else 444 | converter = yuv2rgb_amx<4, false, false, false, false>; 445 | } 446 | } 447 | 448 | converter(width, height, yuv, (char*)yuv + sizeY + sizeU, (char*)yuv + sizeY, strideY, strideU, strideU, rgb, strideRGB); 449 | } 450 | //------------------------------------------------------------------------------ 451 | void yuv2rgb_nv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 452 | { 453 | int strideYUV = align(width, alignWidth); 454 | int sizeY = align(strideYUV * align(height, alignHeight), alignSize); 455 | int sizeUV = align(strideYUV * align(height, 
alignHeight) / 2, alignSize); 456 | 457 | if (strideRGB == 0) 458 | strideRGB = rgbWidth * width; 459 | 460 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 461 | 462 | if (rgbWidth == 3) 463 | { 464 | if (rgbSwizzle) 465 | { 466 | if (fullRange) 467 | converter = yuv2rgb_amx<3, true, true, true, true>; 468 | else 469 | converter = yuv2rgb_amx<3, true, true, true, false>; 470 | } 471 | else 472 | { 473 | if (fullRange) 474 | converter = yuv2rgb_amx<3, false, true, true, true>; 475 | else 476 | converter = yuv2rgb_amx<3, false, true, true, false>; 477 | } 478 | } 479 | else if (rgbWidth == 4) 480 | { 481 | if (rgbSwizzle) 482 | { 483 | if (fullRange) 484 | converter = yuv2rgb_amx<4, true, true, true, true>; 485 | else 486 | converter = yuv2rgb_amx<4, true, true, true, false>; 487 | } 488 | else 489 | { 490 | if (fullRange) 491 | converter = yuv2rgb_amx<4, false, true, true, true>; 492 | else 493 | converter = yuv2rgb_amx<4, false, true, true, false>; 494 | } 495 | } 496 | 497 | converter(width, height, yuv, (char*)yuv + sizeY, (char*)yuv + sizeY + 1, strideYUV, strideYUV, strideYUV, rgb, strideRGB); 498 | } 499 | //------------------------------------------------------------------------------ 500 | void yuv2rgb_nv21_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 501 | { 502 | int strideYUV = align(width, alignWidth); 503 | int sizeY = align(strideYUV * align(height, alignHeight), alignSize); 504 | int sizeUV = align(strideYUV * align(height, alignHeight) / 2, alignSize); 505 | 506 | if (strideRGB == 0) 507 | strideRGB = rgbWidth * width; 508 | 509 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 510 | 511 | if (rgbWidth == 3) 512 | { 513 | if (rgbSwizzle) 514 | { 515 | if (fullRange) 516 | converter = yuv2rgb_amx<3, true, true, false, true>; 517 | else 518 | converter = yuv2rgb_amx<3, true, true, false, false>; 519 | } 520 
| else 521 | { 522 | if (fullRange) 523 | converter = yuv2rgb_amx<3, false, true, false, true>; 524 | else 525 | converter = yuv2rgb_amx<3, false, true, false, false>; 526 | } 527 | } 528 | else if (rgbWidth == 4) 529 | { 530 | if (rgbSwizzle) 531 | { 532 | if (fullRange) 533 | converter = yuv2rgb_amx<4, true, true, false, true>; 534 | else 535 | converter = yuv2rgb_amx<4, true, true, false, false>; 536 | } 537 | else 538 | { 539 | if (fullRange) 540 | converter = yuv2rgb_amx<4, false, true, false, true>; 541 | else 542 | converter = yuv2rgb_amx<4, false, true, false, false>; 543 | } 544 | } 545 | 546 | converter(width, height, yuv, (char*)yuv + sizeY + 1, (char*)yuv + sizeY, strideYUV, strideYUV, strideYUV, rgb, strideRGB); 547 | } 548 | //------------------------------------------------------------------------------ 549 | -------------------------------------------------------------------------------- /yuv2rgb_amx.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb_amx Header 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | template 15 | xxYUV_EXPORT void yuv2rgb_amx(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 16 | //------------------------------------------------------------------------------ 17 | xxYUV_EXPORT void yyy2rgb_amx(int width, int height, const void* y, int strideY, void* rgb, int strideRGB); 18 | //------------------------------------------------------------------------------ 19 | xxYUV_EXPORT void yuv2rgb_yu12_amx(int width, int 
height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 20 | xxYUV_EXPORT void yuv2rgb_yv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 21 | xxYUV_EXPORT void yuv2rgb_nv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 22 | xxYUV_EXPORT void yuv2rgb_nv21_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 23 | //------------------------------------------------------------------------------ 24 | -------------------------------------------------------------------------------- /yuv2yuva.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "yuv2yuva.inl" 12 | #include "yuv2yuva.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void yuv2yuva_yu12(const yuv2yuva_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* y = parameter->y; 23 | const void* u = parameter->u; 24 | 
const void* v = parameter->v; 25 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 26 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 27 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 28 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 29 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 30 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 31 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 32 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 33 | 34 | void* output = parameter->output; 35 | bool swizzleOutput = parameter->swizzleOutput; 36 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 37 | if (strideOutput < 0) 38 | { 39 | output = (char*)output - (strideOutput * (height - 1)); 40 | } 41 | 42 | u = u ? u : (char*)y + sizeY; 43 | v = v ? v : (char*)y + sizeY + sizeU; 44 | 45 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 46 | 47 | if (swizzleOutput) 48 | { 49 | static auto select = yuv2yuva_select(false, false, 2, 1, 0, 3); 50 | converter = select; 51 | } 52 | else 53 | { 54 | static auto select = yuv2yuva_select(false, false, 0, 1, 2, 3); 55 | converter = select; 56 | } 57 | 58 | converter(width, height, y, u, v, strideY, strideU, strideV, output, strideOutput); 59 | } 60 | //------------------------------------------------------------------------------ 61 | void yuv2yuva_yv12(const yuv2yuva_parameter* parameter) 62 | { 63 | int width = parameter->width; 64 | int height = parameter->height; 65 | 66 | const void* y = parameter->y; 67 | const void* u = parameter->u; 68 | const void* v = parameter->v; 69 | int alignWidth = parameter->alignWidth ? 
parameter->alignWidth : 16; 70 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 71 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 72 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 73 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 74 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 75 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 76 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 77 | 78 | void* output = parameter->output; 79 | bool swizzleOutput = parameter->swizzleOutput; 80 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 81 | if (strideOutput < 0) 82 | { 83 | output = (char*)output - (strideOutput * (height - 1)); 84 | } 85 | 86 | u = u ? u : (char*)y + sizeY + sizeU; 87 | v = v ? v : (char*)y + sizeY; 88 | 89 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 90 | 91 | if (swizzleOutput) 92 | { 93 | static auto select = yuv2yuva_select(false, false, 2, 1, 0, 3); 94 | converter = select; 95 | } 96 | else 97 | { 98 | static auto select = yuv2yuva_select(false, false, 0, 1, 2, 3); 99 | converter = select; 100 | } 101 | 102 | converter(width, height, y, u, v, strideY, strideU, strideV, output, strideOutput); 103 | } 104 | //------------------------------------------------------------------------------ 105 | void yuv2yuva_nv12(const yuv2yuva_parameter* parameter) 106 | { 107 | int width = parameter->width; 108 | int height = parameter->height; 109 | 110 | const void* y = parameter->y; 111 | const void* u = parameter->u; 112 | const void* v = parameter->v; 113 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 114 | int alignHeight = parameter->alignHeight ? 
parameter->alignHeight : 1; 115 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 116 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 117 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 118 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 119 | 120 | void* output = parameter->output; 121 | bool swizzleOutput = parameter->swizzleOutput; 122 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 123 | if (strideOutput < 0) 124 | { 125 | output = (char*)output - (strideOutput * (height - 1)); 126 | } 127 | 128 | u = u ? u : (char*)y + sizeY; 129 | v = v ? v : (char*)y + sizeY + 1; 130 | 131 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 132 | 133 | if (swizzleOutput) 134 | { 135 | static auto select = yuv2yuva_select(true, true, 2, 1, 0, 3); 136 | converter = select; 137 | } 138 | else 139 | { 140 | static auto select = yuv2yuva_select(true, true, 0, 1, 2, 3); 141 | converter = select; 142 | } 143 | 144 | converter(width, height, y, u, v, strideY, strideY, strideY, output, strideOutput); 145 | } 146 | //------------------------------------------------------------------------------ 147 | void yuv2yuva_nv21(const yuv2yuva_parameter* parameter) 148 | { 149 | int width = parameter->width; 150 | int height = parameter->height; 151 | 152 | const void* y = parameter->y; 153 | const void* u = parameter->u; 154 | const void* v = parameter->v; 155 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 156 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 157 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 158 | int strideY = parameter->strideY ? 
parameter->strideY : align(width, alignWidth); 159 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 160 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 161 | 162 | void* output = parameter->output; 163 | bool swizzleOutput = parameter->swizzleOutput; 164 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 165 | if (strideOutput < 0) 166 | { 167 | output = (char*)output - (strideOutput * (height - 1)); 168 | } 169 | 170 | u = u ? u : (char*)y + sizeY + 1; 171 | v = v ? v : (char*)y + sizeY; 172 | 173 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 174 | 175 | if (swizzleOutput) 176 | { 177 | static auto select = yuv2yuva_select(true, false, 2, 1, 0, 3); 178 | converter = select; 179 | } 180 | else 181 | { 182 | static auto select = yuv2yuva_select(true, false, 0, 1, 2, 3); 183 | converter = select; 184 | } 185 | 186 | converter(width, height, y, u, v, strideY, strideY, strideY, output, strideOutput); 187 | } 188 | //------------------------------------------------------------------------------ 189 | -------------------------------------------------------------------------------- /yuv2yuva.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | typedef struct _yuv2yuva_parameter 15 | { 16 | int width; 17 | int height; 18 | 19 | const void* y; 20 | const void* u; 21 | const void* v; 22 | int 
alignWidth; 23 | int alignHeight; 24 | int alignSize; 25 | int strideY; 26 | int strideU; 27 | int strideV; 28 | 29 | void* output; 30 | int strideOutput; 31 | bool swizzleOutput; 32 | } yuv2yuva_parameter; 33 | //------------------------------------------------------------------------------ 34 | xxYUV_EXPORT void yuv2yuva_yu12(const yuv2yuva_parameter* parameter); 35 | xxYUV_EXPORT void yuv2yuva_yv12(const yuv2yuva_parameter* parameter); 36 | xxYUV_EXPORT void yuv2yuva_nv12(const yuv2yuva_parameter* parameter); 37 | xxYUV_EXPORT void yuv2yuva_nv21(const yuv2yuva_parameter* parameter); 38 | //------------------------------------------------------------------------------ 39 | -------------------------------------------------------------------------------- /yuv2yuva.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 8 | # include 9 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 10 | # include 11 | # if defined(__llvm__) 12 | # include 13 | # include 14 | # endif 15 | # define _MM_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 16 | __m128i T0, T1, T2, T3; \ 17 | T0 = _mm_unpacklo_epi8(R0, R1); \ 18 | T1 = _mm_unpacklo_epi8(R2, R3); \ 19 | T2 = _mm_unpackhi_epi8(R0, R1); \ 20 | T3 = _mm_unpackhi_epi8(R2, R3); \ 21 | R0 = _mm_unpacklo_epi16(T0, T1); \ 22 | R1 = _mm_unpackhi_epi16(T0, T1); \ 23 | R2 = _mm_unpacklo_epi16(T2, T3); \ 24 | R3 = _mm_unpackhi_epi16(T2, T3); \ 25 | } 26 | # define _MM256_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 27 | __m256i T0, T1, T2, T3; \ 28 | T0 = _mm256_unpacklo_epi8(R0, 
R1); \ 29 | T1 = _mm256_unpacklo_epi8(R2, R3); \ 30 | T2 = _mm256_unpackhi_epi8(R0, R1); \ 31 | T3 = _mm256_unpackhi_epi8(R2, R3); \ 32 | R0 = _mm256_unpacklo_epi16(T0, T1); \ 33 | R1 = _mm256_unpackhi_epi16(T0, T1); \ 34 | R2 = _mm256_unpacklo_epi16(T2, T3); \ 35 | R3 = _mm256_unpackhi_epi16(T2, T3); \ 36 | } 37 | # define _MM256_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 38 | __m256i T0, T1, T2, T3; \ 39 | T0 = _mm256_permute2x128_si256(R0, R1, 32); \ 40 | T1 = _mm256_permute2x128_si256(R0, R1, 49); \ 41 | T2 = _mm256_permute2x128_si256(R2, R3, 32); \ 42 | T3 = _mm256_permute2x128_si256(R2, R3, 49); \ 43 | R0 = T0; \ 44 | R2 = T1; \ 45 | R1 = T2; \ 46 | R3 = T3; \ 47 | } 48 | #endif 49 | 50 | //------------------------------------------------------------------------------ 51 | template 52 | void yuv2yuva(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput) 53 | { 54 | int halfWidth = width >> 1; 55 | int halfHeight = height >> 1; 56 | 57 | for (int h = 0; h < halfHeight; ++h) 58 | { 59 | const unsigned char* y0 = (unsigned char*)y; 60 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 61 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 62 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 63 | unsigned char* output0 = (unsigned char*)output; 64 | unsigned char* output1 = output0 + strideOutput; output = output1 + strideOutput; 65 | #if HAVE_NEON 66 | int halfWidth8 = halfWidth / 8; 67 | for (int w = 0; w < halfWidth8; ++w) 68 | { 69 | uint8x16_t y00 = vld1q_u8(y0); y0 += 16; 70 | uint8x16_t y10 = vld1q_u8(y1); y1 += 16; 71 | 72 | int8x8x2_t u000; 73 | int8x8x2_t v000; 74 | int8x16_t u00; 75 | int8x16_t v00; 76 | if (interleaved) 77 | { 78 | if (firstU) 79 | { 80 | int8x16_t uv00 = vld1q_u8(u0); u0 += 16; 81 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 82 | u000 = vzip_s8(uv00lh.val[0], uv00lh.val[0]); 83 | v000 = 
vzip_s8(uv00lh.val[1], uv00lh.val[1]); 84 | } 85 | else 86 | { 87 | int8x16_t uv00 = vld1q_u8(v0); v0 += 16; 88 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 89 | u000 = vzip_s8(uv00lh.val[1], uv00lh.val[1]); 90 | v000 = vzip_s8(uv00lh.val[0], uv00lh.val[0]); 91 | } 92 | } 93 | else 94 | { 95 | int8x8_t u0000 = vld1_u8(u0); u0 += 8; 96 | int8x8_t v0000 = vld1_u8(v0); v0 += 8; 97 | u000 = vzip_s8(u0000, u0000); 98 | v000 = vzip_s8(v0000, v0000); 99 | } 100 | u00 = vcombine_s8(u000.val[0], u000.val[1]); 101 | v00 = vcombine_s8(v000.val[0], v000.val[1]); 102 | 103 | uint8x16x4_t t; 104 | uint8x16x4_t b; 105 | 106 | t.val[iY] = y00; 107 | t.val[iU] = u00; 108 | t.val[iV] = v00; 109 | t.val[iA] = vdupq_n_u8(255); 110 | b.val[iY] = y10; 111 | b.val[iU] = u00; 112 | b.val[iV] = v00; 113 | b.val[iA] = vdupq_n_u8(255); 114 | 115 | vst4q_u8(output0, t); output0 += 16 * 4; 116 | vst4q_u8(output1, b); output1 += 16 * 4; 117 | } 118 | continue; 119 | #elif HAVE_AVX2 120 | int halfWidth16 = halfWidth / 16; 121 | for (int w = 0; w < halfWidth16; ++w) 122 | { 123 | __m256i y00 = _mm256_loadu_si256((__m256i*)y0); y0 += 32; 124 | __m256i y10 = _mm256_loadu_si256((__m256i*)y1); y1 += 32; 125 | 126 | __m256i u00; 127 | __m256i v00; 128 | if (interleaved) 129 | { 130 | if (firstU) 131 | { 132 | __m256i uv00 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 133 | u00 = _mm256_and_si256(uv00, _mm256_set1_epi16(0xFF)); 134 | v00 = _mm256_srli_epi16(uv00, 8); 135 | } 136 | else 137 | { 138 | __m256i uv00 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 139 | u00 = _mm256_srli_epi16(uv00, 8); 140 | v00 = _mm256_and_si256(uv00, _mm256_set1_epi16(0xFF)); 141 | } 142 | u00 = _mm256_packus_epi16(u00, u00); 143 | v00 = _mm256_packus_epi16(v00, v00); 144 | } 145 | else 146 | { 147 | __m128i u000 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 148 | __m128i v000 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 149 | u00 = _mm256_castsi128_si256(u000); 150 | v00 = 
_mm256_castsi128_si256(v000); 151 | } 152 | u00 = _mm256_unpacklo_epi8(u00, u00); 153 | v00 = _mm256_unpacklo_epi8(v00, v00); 154 | 155 | __m256i t[4]; 156 | __m256i b[4]; 157 | 158 | t[iY] = y00; 159 | t[iU] = u00; 160 | t[iV] = v00; 161 | t[iA] = _mm256_set1_epi8(-1); 162 | b[iY] = y10; 163 | b[iU] = u00; 164 | b[iV] = v00; 165 | b[iA] = _mm256_set1_epi8(-1); 166 | 167 | _MM256_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 168 | _MM256_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 169 | _MM256_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 170 | _MM256_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 171 | 172 | _mm256_storeu_si256((__m256i*)output0 + 0, t[0]); 173 | _mm256_storeu_si256((__m256i*)output0 + 1, t[1]); 174 | _mm256_storeu_si256((__m256i*)output0 + 2, t[2]); 175 | _mm256_storeu_si256((__m256i*)output0 + 3, t[3]); 176 | _mm256_storeu_si256((__m256i*)output1 + 0, b[0]); output0 += 16 * 8; 177 | _mm256_storeu_si256((__m256i*)output1 + 1, b[1]); 178 | _mm256_storeu_si256((__m256i*)output1 + 2, b[2]); 179 | _mm256_storeu_si256((__m256i*)output1 + 3, b[3]); output1 += 16 * 8; 180 | } 181 | continue; 182 | #elif HAVE_SSE2 183 | int halfWidth8 = halfWidth / 8; 184 | for (int w = 0; w < halfWidth8; ++w) 185 | { 186 | __m128i y00 = _mm_loadu_si128((__m128i*)y0); y0 += 16; 187 | __m128i y10 = _mm_loadu_si128((__m128i*)y1); y1 += 16; 188 | 189 | __m128i u00; 190 | __m128i v00; 191 | if (interleaved) 192 | { 193 | if (firstU) 194 | { 195 | __m128i uv00 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 196 | u00 = _mm_and_si128(uv00, _mm_set1_epi16(0xFF)); 197 | v00 = _mm_srli_epi16(uv00, 8); 198 | } 199 | else 200 | { 201 | __m128i uv00 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 202 | u00 = _mm_srli_epi16(uv00, 8); 203 | v00 = _mm_and_si128(uv00, _mm_set1_epi16(0xFF)); 204 | } 205 | u00 = _mm_packus_epi16(u00, u00); 206 | v00 = _mm_packus_epi16(v00, v00); 207 | } 208 | else 209 | { 210 | u00 = _mm_loadl_epi64((__m128i*)u0); u0 += 8; 211 | v00 = _mm_loadl_epi64((__m128i*)v0); v0 += 8; 212 | } 
213 | u00 = _mm_unpacklo_epi8(u00, u00); 214 | v00 = _mm_unpacklo_epi8(v00, v00); 215 | 216 | __m128i t[4]; 217 | __m128i b[4]; 218 | 219 | t[iY] = y00; 220 | t[iU] = u00; 221 | t[iV] = v00; 222 | t[iA] = _mm_set1_epi8(-1); 223 | b[iY] = y10; 224 | b[iU] = u00; 225 | b[iV] = v00; 226 | b[iA] = _mm_set1_epi8(-1); 227 | 228 | _MM_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 229 | _MM_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 230 | 231 | _mm_storeu_si128((__m128i*)output0 + 0, t[0]); 232 | _mm_storeu_si128((__m128i*)output0 + 1, t[1]); 233 | _mm_storeu_si128((__m128i*)output0 + 2, t[2]); 234 | _mm_storeu_si128((__m128i*)output0 + 3, t[3]); output0 += 16 * 4; 235 | _mm_storeu_si128((__m128i*)output1 + 0, b[0]); 236 | _mm_storeu_si128((__m128i*)output1 + 1, b[1]); 237 | _mm_storeu_si128((__m128i*)output1 + 2, b[2]); 238 | _mm_storeu_si128((__m128i*)output1 + 3, b[3]); output1 += 16 * 4; 239 | } 240 | continue; 241 | #endif 242 | for (int w = 0; w < halfWidth; ++w) 243 | { 244 | auto y00 = (*y0++); 245 | auto y01 = (*y0++); 246 | auto y10 = (*y1++); 247 | auto y11 = (*y1++); 248 | 249 | auto u00 = (*u0++); 250 | auto v00 = (*v0++); 251 | if (interleaved) 252 | { 253 | u0++; 254 | v0++; 255 | } 256 | 257 | output0[iY] = y00; 258 | output0[iU] = u00; 259 | output0[iV] = v00; 260 | output0[iA] = 255; 261 | output0 += 4; 262 | 263 | output0[iY] = y01; 264 | output0[iU] = u00; 265 | output0[iV] = v00; 266 | output0[iA] = 255; 267 | output0 += 4; 268 | 269 | output1[iY] = y10; 270 | output1[iU] = u00; 271 | output1[iV] = v00; 272 | output1[iA] = 255; 273 | output1 += 4; 274 | 275 | output1[iY] = y11; 276 | output1[iU] = u00; 277 | output1[iV] = v00; 278 | output1[iA] = 255; 279 | output1 += 4; 280 | } 281 | } 282 | } 283 | //------------------------------------------------------------------------------ 284 | #ifndef yuv2yuva_select 285 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 286 | yuv2yuva 287 | #endif 288 | 
//------------------------------------------------------------------------------ 289 | #ifndef yuv2yuva 290 | //------------------------------------------------------------------------------ 291 | #if defined(__llvm__) 292 | #define yuv2yuva_attribute(value) __attribute__((target(value))) 293 | #else 294 | #define yuv2yuva_attribute(value) 295 | #endif 296 | //------------------------------------------------------------------------------ 297 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 298 | #define HAVE_NEON 1 299 | #define yuv2yuva yuv2yuva_attribute("neon") yuv2yuva_neon 300 | #include "yuv2yuva.inl" 301 | #undef yuv2yuva 302 | #undef HAVE_NEON 303 | #undef yuv2yuva_select 304 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 305 | neon() ? yuv2yuva_neon : \ 306 | yuv2yuva 307 | #endif 308 | //------------------------------------------------------------------------------ 309 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 310 | #define HAVE_SSE2 1 311 | #define yuv2yuva yuv2yuva_attribute("sse2") yuv2yuva_sse2 312 | #include "yuv2yuva.inl" 313 | #undef yuv2yuva 314 | #undef HAVE_SSE2 315 | #undef yuv2yuva_select 316 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 317 | sse2() ? yuv2yuva_sse2 : \ 318 | yuv2yuva 319 | #endif 320 | //------------------------------------------------------------------------------ 321 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 322 | #define HAVE_AVX2 1 323 | #define yuv2yuva yuv2yuva_attribute("avx2") yuv2yuva_avx2 324 | #include "yuv2yuva.inl" 325 | #undef yuv2yuva 326 | #undef HAVE_AVX2 327 | #undef yuv2yuva_select 328 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 329 | avx2() ? yuv2yuva_avx2 : \ 330 | sse2() ? 
yuv2yuva_sse2 : \ 331 | yuv2yuva 332 | #endif 333 | //------------------------------------------------------------------------------ 334 | #endif 335 | //------------------------------------------------------------------------------ 336 | --------------------------------------------------------------------------------