├── LICENSE.txt ├── README.md ├── apple_amx.h ├── cpu.cpp ├── cpu.h ├── rgb2yuv.cpp ├── rgb2yuv.h ├── rgb2yuv.inl ├── yuv.h ├── yuv2rgb.cpp ├── yuv2rgb.h ├── yuv2rgb.inl ├── yuv2rgb_amx.cpp ├── yuv2rgb_amx.h ├── yuv2yuva.cpp ├── yuv2yuva.h └── yuv2yuva.inl /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020-2021 TAiGA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xxYUV 2 | Convert between RGB and YUV 3 | 4 | ## Benchmark Environment 5 | https://github.com/metarutaiga/xxImGui/tree/experimental 6 | 7 | ## Performance (macOS) Encode / Decode (unit : us) 8 | 9 | | | | Encode | | Decode | | 10 | | ---------- | --------- | ----------- | ----------- | ----------- | ----------- | 11 | | | Apple M1 | YU12 / YV12 | NV12 / NV21 | YU12 / YV12 | NV12 / NV21 | 12 | | xxYUV | AMX | ? | ? | 67 | ? | 13 | | xxYUV | NEON | 37 | 38 | 38 | 42 | 14 | | libyuv | NEON | 48 | 49 | 122 | 89 | 15 | | Accelerate | NEON | 56 | 55 | 62 | 59 | 16 | | xxYUV | SSSE3 | 134 | 133 | | | 17 | | libyuv | SSSE3 | 146 | 146 | 171 | 164 | 18 | | Accelerate | SSSE3 | 273 | 274 | 232 | 231 | 19 | | xxYUV | SSE2 | 142 | 143 | 58 | 56 | 20 | 21 | | | | Encode | | Decode | | 22 | | ---------- | --------- | ----------- | ----------- | ----------- | ----------- | 23 | | | i7-8700B | YU12 / YV12 | NV12 / NV21 | YU12 / YV12 | NV12 / NV21 | 24 | | xxYUV | AVX2 | 31 | 33 | 46 | 39 | 25 | | libyuv | AVX2 | 48 | 39 | 60 | 54 | 26 | | Accelerate | AVX2 | 83 | 84 | 67 | 62 | 27 | | xxYUV | SSSE3 | 50 | 51 | | | 28 | | libyuv | SSSE3 | 60 | 61 | 87 | 82 | 29 | | xxYUV | SSE2 | 90 | 91 | 69 | 62 | 30 | -------------------------------------------------------------------------------- /apple_amx.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : Apple AMX Header 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #if defined(__APPLE__) && defined(__aarch64__) 10 | 
//------------------------------------------------------------------------------ 11 | // https://gist.github.com/dougallj/7a75a3be1ec69ca550e7c36dc75e0d6f 12 | // https://gist.github.com/dougallj/7cba721da1a94da725ee37c1e9cd1f21 13 | //------------------------------------------------------------------------------ 14 | #include 15 | #include 16 | //------------------------------------------------------------------------------ 17 | union amx_operands_access 18 | { 19 | struct 20 | { 21 | uint64_t memory_offset:56; 22 | uint64_t register_index:6; 23 | uint64_t double_width:1; 24 | uint64_t dummy_63:1; 25 | }; 26 | uint64_t value; 27 | }; 28 | //------------------------------------------------------------------------------ 29 | union amx_operands_extract 30 | { 31 | struct 32 | { 33 | uint64_t offset_y:10; 34 | uint64_t offset_x:10; 35 | uint64_t offset_z:7; 36 | uint64_t dummy_27:37; 37 | }; 38 | uint64_t value; 39 | }; 40 | //------------------------------------------------------------------------------ 41 | union amx_operands_scalar 42 | { 43 | struct 44 | { 45 | uint64_t offset_y:10; 46 | uint64_t offset_x:10; 47 | uint64_t offset_z:7; 48 | uint64_t skip_z:1; 49 | uint64_t skip_y:1; 50 | uint64_t skip_x:1; 51 | uint64_t dummy_30:2; 52 | uint64_t disable_x:7; 53 | uint64_t dummy_39:2; 54 | uint64_t disable_y:7; 55 | uint64_t dummy_48:13; 56 | uint64_t mode_8:1; 57 | uint64_t mode_32:1; 58 | uint64_t vector_matrix_add:1; 59 | }; 60 | uint64_t value; 61 | }; 62 | //------------------------------------------------------------------------------ 63 | union amx_operands_vector 64 | { 65 | struct 66 | { 67 | uint64_t offset_y:10; 68 | uint64_t offset_x:10; 69 | uint64_t offset_z:7; 70 | uint64_t count_y:2; 71 | uint64_t count_x:2; 72 | uint64_t dummy_31:1; 73 | uint64_t mask:10; 74 | uint64_t extended:4; 75 | uint64_t dummy_46:1; 76 | uint64_t neg:1; 77 | uint64_t add:1; 78 | uint64_t dummy_49:9; 79 | uint64_t shift_right:5; 80 | uint64_t sign:1; 81 | }; 82 | uint64_t 
value; 83 | }; 84 | //------------------------------------------------------------------------------ 85 | union amx_operands_matrix 86 | { 87 | struct 88 | { 89 | uint64_t offset_y:10; 90 | uint64_t offset_x:10; 91 | uint64_t offset_z:7; 92 | uint64_t dummy_27:5; 93 | uint64_t mask:10; 94 | uint64_t extended:4; 95 | uint64_t dummy_46:1; 96 | uint64_t neg:1; 97 | uint64_t add:1; 98 | uint64_t dummy_49:9; 99 | uint64_t shift_right:5; 100 | uint64_t sign:1; 101 | }; 102 | uint64_t value; 103 | }; 104 | //------------------------------------------------------------------------------ 105 | #define amx_ldx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 0 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 106 | #define amx_ldy(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 1 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 107 | #define amx_stx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 2 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 108 | #define amx_sty(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 3 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 109 | #define amx_ldz(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 4 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 110 | #define amx_stz(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 5 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 111 | #define amx_ldzi(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 6 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 112 | #define amx_stzi(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 7 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 113 | #define amx_extrx(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 8 << 5) | 0)" ::"r"((amx_operands_extract{__VA_ARGS__})) : "x0", "memory") 114 | #define amx_extry(...) 
__asm__ volatile("mov x0, %0 \n .word (0x201000 | ( 9 << 5) | 0)" ::"r"((amx_operands_extract{__VA_ARGS__})) : "x0", "memory") 115 | #define amx_fma64(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (10 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 116 | #define amx_fms64(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (11 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 117 | #define amx_fma32(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (12 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 118 | #define amx_fms32(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (13 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 119 | #define amx_mac16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (14 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 120 | #define amx_fma16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (15 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 121 | #define amx_fms16(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (16 << 5) | 0)" ::"r"((amx_operands_scalar{__VA_ARGS__})) : "x0", "memory") 122 | #define amx_set() __asm__ volatile("nop \n nop \n nop \n .word (0x201000 | (17 << 5) | 0)" ::: "memory") 123 | #define amx_clr() __asm__ volatile("nop \n nop \n nop \n .word (0x201000 | (17 << 5) | 1)" ::: "memory") 124 | #define amx_vecint(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (18 << 5) | 0)" ::"r"((amx_operands_vector{__VA_ARGS__})) : "x0", "memory") 125 | #define amx_vecfp(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (19 << 5) | 0)" ::"r"((amx_operands_vector{__VA_ARGS__})) : "x0", "memory") 126 | #define amx_matint(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (20 << 5) | 0)" ::"r"((amx_operands_matrix{__VA_ARGS__})) : "x0", "memory") 127 | #define amx_matfp(...) 
__asm__ volatile("mov x0, %0 \n .word (0x201000 | (21 << 5) | 0)" ::"r"((amx_operands_matrix{__VA_ARGS__})) : "x0", "memory") 128 | #define amx_gemlut(...) __asm__ volatile("mov x0, %0 \n .word (0x201000 | (22 << 5) | 0)" ::"r"((amx_operands_access{__VA_ARGS__})) : "x0", "memory") 129 | //------------------------------------------------------------------------------ 130 | inline void amx_dump(int index, int16_t hint) 131 | { 132 | uint8_t row[64]; 133 | amx_stz( .memory_offset = (uint64_t)row, .register_index = (uint64_t)index ); 134 | printf("%2d : ", index); 135 | for (int i = 0; i < 64; ++i) 136 | { 137 | if (i == 0) 138 | { 139 | printf("(%04X) ", __builtin_bswap16(hint) & 0xFFFF); 140 | } 141 | printf("%02X", row[i]); 142 | if (i % 8 == 7) 143 | printf(" "); 144 | } 145 | printf("\n"); 146 | } 147 | //------------------------------------------------------------------------------ 148 | #endif 149 | -------------------------------------------------------------------------------- /cpu.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : cpu Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 8 | # if defined(_M_IX86) || defined(_M_AMD64) 9 | # include 10 | # define bit_SSSE3 (1 << 9) 11 | # define bit_AVX2 (1 << 5) 12 | # define bit_AVX512BW (1 << 30) 13 | static inline int __get_cpuid(int leaf, unsigned int* eax, unsigned int* ebx, unsigned int* ecx, unsigned int* edx) 14 | { 15 | int regs[4]; 16 | __cpuid(regs, leaf); 17 | *eax = regs[0]; 18 | *ebx = regs[1]; 19 | *ecx = regs[2]; 20 | *edx = regs[3]; 21 | return 1; 22 | } 23 | static inline int __get_cpuid_count(int leaf, int subleaf, unsigned int* eax, unsigned int* 
ebx, unsigned int* ecx, unsigned int* edx) 24 | { 25 | int regs[4]; 26 | __cpuidex(regs, leaf, subleaf); 27 | *eax = regs[0]; 28 | *ebx = regs[1]; 29 | *ecx = regs[2]; 30 | *edx = regs[3]; 31 | return 1; 32 | } 33 | # elif defined(__i386__) || defined(__amd64__) 34 | # include 35 | # endif 36 | # include 37 | #endif 38 | #include "cpu.h" 39 | 40 | //------------------------------------------------------------------------------ 41 | bool ssse3() 42 | { 43 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 44 | unsigned int eax, ebx, ecx, edx; 45 | if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) 46 | { 47 | return (ecx & bit_SSSE3) != 0; 48 | } 49 | #endif 50 | return false; 51 | } 52 | //------------------------------------------------------------------------------ 53 | bool avx2() 54 | { 55 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 56 | unsigned int eax, ebx, ecx, edx; 57 | if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) 58 | { 59 | return (ebx & bit_AVX2) != 0; 60 | } 61 | #endif 62 | return false; 63 | } 64 | //------------------------------------------------------------------------------ 65 | bool avx512bw() 66 | { 67 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 68 | unsigned int eax, ebx, ecx, edx; 69 | if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) 70 | { 71 | return (ebx & bit_AVX512BW) != 0; 72 | } 73 | #endif 74 | return false; 75 | } 76 | //------------------------------------------------------------------------------ 77 | -------------------------------------------------------------------------------- /cpu.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : cpu Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | 
//============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | inline bool neon() { return true; } 15 | //------------------------------------------------------------------------------ 16 | inline bool sse2() { return true; } 17 | xxYUV_EXPORT bool ssse3(); 18 | xxYUV_EXPORT bool avx2(); 19 | xxYUV_EXPORT bool avx512bw(); 20 | //------------------------------------------------------------------------------ 21 | -------------------------------------------------------------------------------- /rgb2yuv.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "rgb2yuv.inl" 12 | #include "rgb2yuv.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void rgb2yuv_yu12(const rgb2yuv_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* rgb = parameter->rgb; 23 | int componentRGB = parameter->componentRGB; 24 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 25 | bool swizzleRGB = parameter->swizzleRGB; 26 | if (strideRGB < 0) 27 | { 28 | rgb = (char*)rgb - (strideRGB * (height - 1)); 29 | } 30 | 31 | void* y = parameter->y; 32 | void* u = parameter->u; 33 | void* v = parameter->v; 34 | int alignWidth = parameter->alignWidth ? 
parameter->alignWidth : 16; 35 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 36 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 37 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 38 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 39 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 40 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 41 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 42 | bool videoRange = parameter->videoRange; 43 | 44 | u = u ? u : (char*)y + sizeY; 45 | v = v ? v : (char*)y + sizeY + sizeU; 46 | 47 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 48 | 49 | if (componentRGB == 3) 50 | { 51 | if (swizzleRGB) 52 | { 53 | if (videoRange) 54 | { 55 | static auto select = rgb2yuv_select(3, true, false, false, true); 56 | converter = select; 57 | } 58 | else 59 | { 60 | static auto select = rgb2yuv_select(3, true, false, false, false); 61 | converter = select; 62 | } 63 | } 64 | else 65 | { 66 | if (videoRange) 67 | { 68 | static auto select = rgb2yuv_select(3, false, false, false, true); 69 | converter = select; 70 | } 71 | else 72 | { 73 | static auto select = rgb2yuv_select(3, false, false, false, false); 74 | converter = select; 75 | } 76 | } 77 | } 78 | else if (componentRGB == 4) 79 | { 80 | if (swizzleRGB) 81 | { 82 | if (videoRange) 83 | { 84 | static auto select = rgb2yuv_select(4, true, false, false, true); 85 | converter = select; 86 | } 87 | else 88 | { 89 | static auto select = rgb2yuv_select(4, true, false, false, false); 90 | converter = select; 91 | } 92 | } 93 | else 94 | { 95 | if (videoRange) 96 | { 97 | static auto select = rgb2yuv_select(4, false, false, false, true); 98 | converter = select; 99 | } 100 | else 101 | { 102 | static auto 
select = rgb2yuv_select(4, false, false, false, false); 103 | converter = select; 104 | } 105 | } 106 | } 107 | else 108 | { 109 | return; 110 | } 111 | 112 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideU, strideU); 113 | } 114 | //------------------------------------------------------------------------------ 115 | void rgb2yuv_yv12(const rgb2yuv_parameter* parameter) 116 | { 117 | int width = parameter->width; 118 | int height = parameter->height; 119 | 120 | const void* rgb = parameter->rgb; 121 | int componentRGB = parameter->componentRGB; 122 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 123 | bool swizzleRGB = parameter->swizzleRGB; 124 | if (strideRGB < 0) 125 | { 126 | rgb = (char*)rgb - (strideRGB * (height - 1)); 127 | } 128 | 129 | void* y = parameter->y; 130 | void* u = parameter->u; 131 | void* v = parameter->v; 132 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 133 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 134 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 135 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 136 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 137 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 138 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 139 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 140 | bool videoRange = parameter->videoRange; 141 | 142 | u = u ? u : (char*)y + sizeY + sizeU; 143 | v = v ? 
v : (char*)y + sizeY; 144 | 145 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 146 | 147 | if (componentRGB == 3) 148 | { 149 | if (swizzleRGB) 150 | { 151 | if (videoRange) 152 | { 153 | static auto select = rgb2yuv_select(3, true, false, false, true); 154 | converter = select; 155 | } 156 | else 157 | { 158 | static auto select = rgb2yuv_select(3, true, false, false, false); 159 | converter = select; 160 | } 161 | } 162 | else 163 | { 164 | if (videoRange) 165 | { 166 | static auto select = rgb2yuv_select(3, false, false, false, true); 167 | converter = select; 168 | } 169 | else 170 | { 171 | static auto select = rgb2yuv_select(3, false, false, false, false); 172 | converter = select; 173 | } 174 | } 175 | } 176 | else if (componentRGB == 4) 177 | { 178 | if (swizzleRGB) 179 | { 180 | if (videoRange) 181 | { 182 | static auto select = rgb2yuv_select(4, true, false, false, true); 183 | converter = select; 184 | } 185 | else 186 | { 187 | static auto select = rgb2yuv_select(4, true, false, false, false); 188 | converter = select; 189 | } 190 | } 191 | else 192 | { 193 | if (videoRange) 194 | { 195 | static auto select = rgb2yuv_select(4, false, false, false, true); 196 | converter = select; 197 | } 198 | else 199 | { 200 | static auto select = rgb2yuv_select(4, false, false, false, false); 201 | converter = select; 202 | } 203 | } 204 | } 205 | else 206 | { 207 | return; 208 | } 209 | 210 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideU, strideU); 211 | } 212 | //------------------------------------------------------------------------------ 213 | void rgb2yuv_nv12(const rgb2yuv_parameter* parameter) 214 | { 215 | int width = parameter->width; 216 | int height = parameter->height; 217 | 218 | const void* rgb = parameter->rgb; 219 | int componentRGB = parameter->componentRGB; 220 | int strideRGB = parameter->strideRGB ? 
parameter->strideRGB : componentRGB * width; 221 | bool swizzleRGB = parameter->swizzleRGB; 222 | if (strideRGB < 0) 223 | { 224 | rgb = (char*)rgb - (strideRGB * (height - 1)); 225 | } 226 | 227 | void* y = parameter->y; 228 | void* u = parameter->u; 229 | void* v = parameter->v; 230 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 231 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 232 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 233 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 234 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 235 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 236 | bool videoRange = parameter->videoRange; 237 | 238 | u = u ? u : (char*)y + sizeY; 239 | v = v ? v : (char*)y + sizeY + 1; 240 | 241 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 242 | 243 | if (componentRGB == 3) 244 | { 245 | if (swizzleRGB) 246 | { 247 | if (videoRange) 248 | { 249 | static auto select = rgb2yuv_select(3, true, true, true, true); 250 | converter = select; 251 | } 252 | else 253 | { 254 | static auto select = rgb2yuv_select(3, true, true, true, false); 255 | converter = select; 256 | } 257 | } 258 | else 259 | { 260 | if (videoRange) 261 | { 262 | static auto select = rgb2yuv_select(3, false, true, true, true); 263 | converter = select; 264 | } 265 | else 266 | { 267 | static auto select = rgb2yuv_select(3, false, true, true, false); 268 | converter = select; 269 | } 270 | } 271 | } 272 | else if (componentRGB == 4) 273 | { 274 | if (swizzleRGB) 275 | { 276 | if (videoRange) 277 | { 278 | static auto select = rgb2yuv_select(4, true, true, true, true); 279 | converter = select; 280 | } 281 | else 282 | { 283 | static auto select = rgb2yuv_select(4, true, true, true, false); 284 | converter = select; 285 | } 286 | } 
287 | else 288 | { 289 | if (videoRange) 290 | { 291 | static auto select = rgb2yuv_select(4, false, true, true, true); 292 | converter = select; 293 | } 294 | else 295 | { 296 | static auto select = rgb2yuv_select(4, false, true, true, false); 297 | converter = select; 298 | } 299 | } 300 | } 301 | else 302 | { 303 | return; 304 | } 305 | 306 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideY, strideY); 307 | } 308 | //------------------------------------------------------------------------------ 309 | void rgb2yuv_nv21(const rgb2yuv_parameter* parameter) 310 | { 311 | int width = parameter->width; 312 | int height = parameter->height; 313 | 314 | const void* rgb = parameter->rgb; 315 | int componentRGB = parameter->componentRGB; 316 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 317 | bool swizzleRGB = parameter->swizzleRGB; 318 | if (strideRGB < 0) 319 | { 320 | rgb = (char*)rgb - (strideRGB * (height - 1)); 321 | } 322 | 323 | void* y = parameter->y; 324 | void* u = parameter->u; 325 | void* v = parameter->v; 326 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 327 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 328 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 329 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 330 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 331 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 332 | bool videoRange = parameter->videoRange; 333 | 334 | u = u ? u : (char*)y + sizeY + 1; 335 | v = v ? 
v : (char*)y + sizeY; 336 | 337 | void (*converter)(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV); 338 | 339 | if (componentRGB == 3) 340 | { 341 | if (swizzleRGB) 342 | { 343 | if (videoRange) 344 | { 345 | static auto select = rgb2yuv_select(3, true, true, false, true); 346 | converter = select; 347 | } 348 | else 349 | { 350 | static auto select = rgb2yuv_select(3, true, true, false, false); 351 | converter = select; 352 | } 353 | } 354 | else 355 | { 356 | if (videoRange) 357 | { 358 | static auto select = rgb2yuv_select(3, false, true, false, true); 359 | converter = select; 360 | } 361 | else 362 | { 363 | static auto select = rgb2yuv_select(3, false, true, false, false); 364 | converter = select; 365 | } 366 | } 367 | } 368 | else if (componentRGB == 4) 369 | { 370 | if (swizzleRGB) 371 | { 372 | if (videoRange) 373 | { 374 | static auto select = rgb2yuv_select(4, true, true, false, true); 375 | converter = select; 376 | } 377 | else 378 | { 379 | static auto select = rgb2yuv_select(4, true, true, false, false); 380 | converter = select; 381 | } 382 | } 383 | else 384 | { 385 | if (videoRange) 386 | { 387 | static auto select = rgb2yuv_select(4, false, true, false, true); 388 | converter = select; 389 | } 390 | else 391 | { 392 | static auto select = rgb2yuv_select(4, false, true, false, false); 393 | converter = select; 394 | } 395 | } 396 | } 397 | else 398 | { 399 | return; 400 | } 401 | 402 | converter(width, height, rgb, strideRGB, y, u, v, strideY, strideY, strideY); 403 | } 404 | //------------------------------------------------------------------------------ 405 | -------------------------------------------------------------------------------- /rgb2yuv.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Header 3 | // 4 | // Copyright (c) 
2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | typedef struct _rgb2yuv_parameter 15 | { 16 | int width; 17 | int height; 18 | 19 | const void* rgb; 20 | int componentRGB; 21 | int strideRGB; 22 | bool swizzleRGB; 23 | 24 | void* y; 25 | void* u; 26 | void* v; 27 | int alignWidth; 28 | int alignHeight; 29 | int alignSize; 30 | int strideY; 31 | int strideU; 32 | int strideV; 33 | bool videoRange; 34 | } rgb2yuv_parameter; 35 | //------------------------------------------------------------------------------ 36 | xxYUV_EXPORT void rgb2yuv_yu12(const rgb2yuv_parameter* parameter); 37 | xxYUV_EXPORT void rgb2yuv_yv12(const rgb2yuv_parameter* parameter); 38 | xxYUV_EXPORT void rgb2yuv_nv12(const rgb2yuv_parameter* parameter); 39 | xxYUV_EXPORT void rgb2yuv_nv21(const rgb2yuv_parameter* parameter); 40 | //------------------------------------------------------------------------------ 41 | #ifndef xxYUV_DEPRECATED 42 | //------------------------------------------------------------------------------ 43 | inline void rgb2yuv_yu12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 44 | { 45 | rgb2yuv_parameter parameter = 46 | { 47 | .width = width, 48 | .height = height, 49 | .rgb = rgb, 50 | .componentRGB = rgbWidth, 51 | .strideRGB = strideRGB, 52 | .swizzleRGB = rgbSwizzle, 53 | .y = yuv, 54 | .alignWidth = alignWidth, 55 | .alignHeight = alignHeight, 56 | .alignSize = alignSize, 57 | .videoRange = !fullRange, 58 | }; 59 | rgb2yuv_yu12(¶meter); 60 | } 61 | //------------------------------------------------------------------------------ 62 | inline void 
rgb2yuv_yv12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 63 | { 64 | rgb2yuv_parameter parameter = 65 | { 66 | .width = width, 67 | .height = height, 68 | .rgb = rgb, 69 | .componentRGB = rgbWidth, 70 | .strideRGB = strideRGB, 71 | .swizzleRGB = rgbSwizzle, 72 | .y = yuv, 73 | .alignWidth = alignWidth, 74 | .alignHeight = alignHeight, 75 | .alignSize = alignSize, 76 | .videoRange = !fullRange, 77 | }; 78 | rgb2yuv_yv12(¶meter); 79 | } 80 | //------------------------------------------------------------------------------ 81 | inline void rgb2yuv_nv12(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 82 | { 83 | rgb2yuv_parameter parameter = 84 | { 85 | .width = width, 86 | .height = height, 87 | .rgb = rgb, 88 | .componentRGB = rgbWidth, 89 | .strideRGB = strideRGB, 90 | .swizzleRGB = rgbSwizzle, 91 | .y = yuv, 92 | .alignWidth = alignWidth, 93 | .alignHeight = alignHeight, 94 | .alignSize = alignSize, 95 | .videoRange = !fullRange, 96 | }; 97 | rgb2yuv_nv12(¶meter); 98 | } 99 | //------------------------------------------------------------------------------ 100 | inline void rgb2yuv_nv21(int width, int height, const void* rgb, void* yuv, int rgbWidth = 3, bool rgbSwizzle = false, bool fullRange = true, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 101 | { 102 | rgb2yuv_parameter parameter = 103 | { 104 | .width = width, 105 | .height = height, 106 | .rgb = rgb, 107 | .componentRGB = rgbWidth, 108 | .strideRGB = strideRGB, 109 | .swizzleRGB = rgbSwizzle, 110 | .y = yuv, 111 | .alignWidth = alignWidth, 112 | .alignHeight = alignHeight, 113 | .alignSize = alignSize, 114 | .videoRange = !fullRange, 115 | }; 116 | rgb2yuv_nv21(¶meter); 117 | } 
118 | //------------------------------------------------------------------------------ 119 | #endif 120 | //------------------------------------------------------------------------------ 121 | -------------------------------------------------------------------------------- /rgb2yuv.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : rgb2yuv Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | // BT.709 - Video Range 8 | // R G B 9 | // Y = 0.18275 0.61477 0.06200 10 | // U = -0.10072 -0.33882 0.43931 11 | // V = 0.43867 -0.40048 -0.04038 12 | // 13 | // BT.709 - Full Range 14 | // R G B 15 | // Y = 0.21260 0.71520 0.07220 16 | // U = -0.11412 -0.38392 0.49804 17 | // V = 0.49804 -0.45237 -0.04567 18 | #define fRY 0.21260 19 | #define fGY 0.71520 20 | #define fBY 0.07220 21 | #define fRU -0.11412 22 | #define fGU -0.38392 23 | #define fBU 0.49804 24 | #define fRV 0.49804 25 | #define fGV -0.45237 26 | #define fBV -0.04567 27 | #define vRY 0.18275 28 | #define vGY 0.61477 29 | #define vBY 0.06200 30 | #define vRU -0.10072 31 | #define vGU -0.33882 32 | #define vBU 0.43931 33 | #define vRV 0.43867 34 | #define vGV -0.40048 35 | #define vBV -0.04038 36 | 37 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 38 | # include 39 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 40 | # include 41 | # include 42 | # if defined(__llvm__) 43 | # include 44 | # include 45 | # endif 46 | # if defined(_MSC_VER) && !defined(__llvm__) 47 | # define _mm_shuffle_ps(a, b, c) (__m128i&)_mm_shuffle_ps((__m128&)a, (__m128&)b, c) 48 | # define _mm256_shuffle_ps(a, b, c) (__m256i&)_mm256_shuffle_ps((__m256&)a, 
(__m256&)b, c) 49 | # define _mm_movehl_ps(a, b) (__m128i&)_mm_movehl_ps((__m128&)a, (__m128&)b) 50 | # define _mm_storel_pi(a, b) _mm_storel_pi(a, (__m128&)b) 51 | # define _mm_storeh_pi(a, b) _mm_storeh_pi(a, (__m128&)b) 52 | # endif 53 | #endif 54 | 55 | //------------------------------------------------------------------------------ 56 | template 57 | void rgb2yuv(int width, int height, const void* rgb, int strideRGB, void* y, void* u, void* v, int strideY, int strideU, int strideV) 58 | { 59 | int halfWidth = width >> 1; 60 | int halfHeight = height >> 1; 61 | 62 | int iR = swizzleRGB ? 2 : 0; 63 | int iG = 1; 64 | int iB = swizzleRGB ? 0 : 2; 65 | int iA = 3; 66 | 67 | int Y[3], U[3], V[3]; 68 | if (videoRange) 69 | { 70 | Y[iR] = (int)(vRY * 256); U[iR] = (int)(vRU * 255); V[iR] = (int)(vRV * 255); 71 | Y[iG] = (int)(vGY * 256); U[iG] = (int)(vGU * 255); V[iG] = (int)(vGV * 255); 72 | Y[iB] = (int)(vBY * 256); U[iB] = (int)(vBU * 255); V[iB] = (int)(vBV * 255); 73 | } 74 | else 75 | { 76 | Y[iR] = (int)(fRY * 256); U[iR] = (int)(fRU * 255); V[iR] = (int)(fRV * 255); 77 | Y[iG] = (int)(fGY * 256); U[iG] = (int)(fGU * 255); V[iG] = (int)(fGV * 255); 78 | Y[iB] = (int)(fBY * 256); U[iB] = (int)(fBU * 255); V[iB] = (int)(fBV * 255); 79 | } 80 | 81 | for (int h = 0; h < halfHeight; ++h) 82 | { 83 | const unsigned char* rgb0 = (unsigned char*)rgb; 84 | const unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 85 | unsigned char* y0 = (unsigned char*)y; 86 | unsigned char* y1 = y0 + strideY; y = y1 + strideY; 87 | unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 88 | unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 89 | #if HAVE_NEON 90 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 91 | for (int w = 0; w < halfWidth8; ++w) 92 | { 93 | uint8x16x4_t rgb00 = vld4q_u8(rgb0); rgb0 += 16 * 4; 94 | uint8x16x4_t rgb10 = vld4q_u8(rgb1); rgb1 += 16 * 4; 95 | 96 | uint8x8_t r00 = vget_low_u8(rgb00.val[0]); 97 | uint8x8_t g00 = vget_low_u8(rgb00.val[1]); 98 | uint8x8_t b00 = vget_low_u8(rgb00.val[2]); 99 | uint8x8_t r01 = vget_high_u8(rgb00.val[0]); 100 | uint8x8_t g01 = vget_high_u8(rgb00.val[1]); 101 | uint8x8_t b01 = vget_high_u8(rgb00.val[2]); 102 | uint8x8_t r10 = vget_low_u8(rgb10.val[0]); 103 | uint8x8_t g10 = vget_low_u8(rgb10.val[1]); 104 | uint8x8_t b10 = vget_low_u8(rgb10.val[2]); 105 | uint8x8_t r11 = vget_high_u8(rgb10.val[0]); 106 | uint8x8_t g11 = vget_high_u8(rgb10.val[1]); 107 | uint8x8_t b11 = vget_high_u8(rgb10.val[2]); 108 | 109 | uint8x8_t y00 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r00, vdup_n_u8(Y[0])), g00, vdup_n_u8(Y[1])), b00, vdup_n_u8(Y[2])), 8); 110 | uint8x8_t y01 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r01, vdup_n_u8(Y[0])), g01, vdup_n_u8(Y[1])), b01, vdup_n_u8(Y[2])), 8); 111 | uint8x8_t y10 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r10, vdup_n_u8(Y[0])), g10, vdup_n_u8(Y[1])), b10, vdup_n_u8(Y[2])), 8); 112 | uint8x8_t y11 = vqshrn_n_u16(vmlal_u8(vmlal_u8(vmull_u8(r11, vdup_n_u8(Y[0])), g11, vdup_n_u8(Y[1])), b11, vdup_n_u8(Y[2])), 8); 113 | uint8x16_t y000 = vcombine_u8(y00, y01); 114 | uint8x16_t y100 = vcombine_u8(y10, y11); 115 | if (videoRange) 116 | { 117 | y000 = vqaddq_u8(vcombine_u8(y00, y01), vdupq_n_u8(16)); 118 | y100 = vqaddq_u8(vcombine_u8(y10, y11), vdupq_n_u8(16)); 119 | } 120 | else 121 | { 122 | y000 = vcombine_u8(y00, y01); 123 | y100 = vcombine_u8(y10, y11); 124 | } 125 | 126 | int16x8_t r000 = vpadalq_u8(vpaddlq_u8(rgb00.val[0]), rgb10.val[0]); 127 | int16x8_t g000 = vpadalq_u8(vpaddlq_u8(rgb00.val[1]), rgb10.val[1]); 128 | int16x8_t b000 = vpadalq_u8(vpaddlq_u8(rgb00.val[2]), rgb10.val[2]); 129 | 130 | uint8x8_t u00 = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, 
vdupq_n_s16(U[0] >> 2)), g000, vdupq_n_s16(U[1] >> 2)), b000, vdupq_n_s16(U[2] >> 2)), 8); 131 | uint8x8_t v00 = vrshrn_n_s16(vmlaq_s16(vmlaq_s16(vmulq_s16(r000, vdupq_n_s16(V[0] >> 2)), g000, vdupq_n_s16(V[1] >> 2)), b000, vdupq_n_s16(V[2] >> 2)), 8); 132 | u00 = vadd_u8(u00, vdup_n_u8(128)); 133 | v00 = vadd_u8(v00, vdup_n_u8(128)); 134 | 135 | vst1q_u8(y0, y000); y0 += 16; 136 | vst1q_u8(y1, y100); y1 += 16; 137 | if (interleaved) 138 | { 139 | if (firstU) 140 | { 141 | uint8x8x2_t uv00 = vzip_u8(u00, v00); 142 | vst1q_u8(u0, vcombine_u8(uv00.val[0], uv00.val[1])); u0 += 16; 143 | } 144 | else 145 | { 146 | uint8x8x2_t uv00 = vzip_u8(v00, u00); 147 | vst1q_u8(v0, vcombine_u8(uv00.val[0], uv00.val[1])); v0 += 16; 148 | } 149 | } 150 | else 151 | { 152 | vst1_u8(u0, u00); u0 += 8; 153 | vst1_u8(v0, v00); v0 += 8; 154 | } 155 | } 156 | if (componentRGB == 4) 157 | continue; 158 | #elif HAVE_AVX2 159 | int halfWidth16 = (componentRGB == 4) ? halfWidth / 16 : 0; 160 | for (int w = 0; w < halfWidth16; ++w) 161 | { 162 | __m256i rgb00[4] = { _mm256_loadu_si256((__m256i*)rgb0), _mm256_loadu_si256((__m256i*)rgb0 + 1), _mm256_loadu_si256((__m256i*)rgb0 + 2), _mm256_loadu_si256((__m256i*)rgb0 + 3) }; rgb0 += 32 * 4; 163 | __m256i rgb10[4] = { _mm256_loadu_si256((__m256i*)rgb1), _mm256_loadu_si256((__m256i*)rgb1 + 1), _mm256_loadu_si256((__m256i*)rgb1 + 2), _mm256_loadu_si256((__m256i*)rgb1 + 3) }; rgb1 += 32 * 4; 164 | 165 | __m256i yy = _mm256_setr_epi8(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 166 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 167 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 168 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 169 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 170 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 171 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 172 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 173 | __m256i yy000 = _mm256_maddubs_epi16(rgb00[0], yy); 174 | __m256i yy001 = _mm256_maddubs_epi16(rgb00[1], yy); 175 | __m256i yy010 = _mm256_maddubs_epi16(rgb00[2], yy); 176 | __m256i yy011 = 
_mm256_maddubs_epi16(rgb00[3], yy); 177 | __m256i yy100 = _mm256_maddubs_epi16(rgb10[0], yy); 178 | __m256i yy101 = _mm256_maddubs_epi16(rgb10[1], yy); 179 | __m256i yy110 = _mm256_maddubs_epi16(rgb10[2], yy); 180 | __m256i yy111 = _mm256_maddubs_epi16(rgb10[3], yy); 181 | __m256i y00 = _mm256_hadd_epi16(yy000, yy001); 182 | __m256i y01 = _mm256_hadd_epi16(yy010, yy011); 183 | __m256i y10 = _mm256_hadd_epi16(yy100, yy101); 184 | __m256i y11 = _mm256_hadd_epi16(yy110, yy111); 185 | y00 = _mm256_srli_epi16(y00, 7); 186 | y01 = _mm256_srli_epi16(y01, 7); 187 | y10 = _mm256_srli_epi16(y10, 7); 188 | y11 = _mm256_srli_epi16(y11, 7); 189 | __m256i y000 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(y00, y01), _mm256_setr_epi32(0,4,1,5,2,6,3,7)); 190 | __m256i y100 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(y10, y11), _mm256_setr_epi32(0,4,1,5,2,6,3,7)); 191 | if (videoRange) 192 | { 193 | y000 = _mm256_adds_epu8(y000, _mm256_set1_epi8(16)); 194 | y100 = _mm256_adds_epu8(y100, _mm256_set1_epi8(16)); 195 | } 196 | 197 | __m256i uv00 = _mm256_avg_epu8(rgb00[0], rgb10[0]); 198 | __m256i uv01 = _mm256_avg_epu8(rgb00[1], rgb10[1]); 199 | __m256i uv10 = _mm256_avg_epu8(rgb00[2], rgb10[2]); 200 | __m256i uv11 = _mm256_avg_epu8(rgb00[3], rgb10[3]); 201 | __m256i uv0 = _mm256_avg_epu8(_mm256_shuffle_ps(uv00, uv01, _MM_SHUFFLE(2,0,2,0)), _mm256_shuffle_ps(uv00, uv01, _MM_SHUFFLE(3,1,3,1))); 202 | __m256i uv1 = _mm256_avg_epu8(_mm256_shuffle_ps(uv10, uv11, _MM_SHUFFLE(2,0,2,0)), _mm256_shuffle_ps(uv10, uv11, _MM_SHUFFLE(3,1,3,1))); 203 | __m256i uu = _mm256_setr_epi8(U[0], U[1], U[2], 0, 204 | U[0], U[1], U[2], 0, 205 | U[0], U[1], U[2], 0, 206 | U[0], U[1], U[2], 0, 207 | U[0], U[1], U[2], 0, 208 | U[0], U[1], U[2], 0, 209 | U[0], U[1], U[2], 0, 210 | U[0], U[1], U[2], 0); 211 | __m256i vv = _mm256_setr_epi8(V[0], V[1], V[2], 0, 212 | V[0], V[1], V[2], 0, 213 | V[0], V[1], V[2], 0, 214 | V[0], V[1], V[2], 0, 215 | V[0], V[1], V[2], 0, 216 | V[0], V[1], V[2], 0, 217 | 
V[0], V[1], V[2], 0, 218 | V[0], V[1], V[2], 0); 219 | __m256i uu00 = _mm256_maddubs_epi16(uv0, uu); 220 | __m256i uu01 = _mm256_maddubs_epi16(uv1, uu); 221 | __m256i vv00 = _mm256_maddubs_epi16(uv0, vv); 222 | __m256i vv01 = _mm256_maddubs_epi16(uv1, vv); 223 | __m256i uu02 = _mm256_hadd_epi16(uu00, uu01); 224 | __m256i vv02 = _mm256_hadd_epi16(vv00, vv01); 225 | uu02 = _mm256_srai_epi16(uu02, 8); 226 | vv02 = _mm256_srai_epi16(vv02, 8); 227 | __m256i mask = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 228 | 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); 229 | __m256i uv02 = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(_mm256_packs_epi16(uu02, vv02), _MM_SHUFFLE(3,1,2,0)), mask); 230 | uv02 = _mm256_sub_epi8(uv02, _mm256_set1_epi8(-128)); 231 | 232 | _mm256_storeu_si256((__m256i*)y0, y000); y0 += 32; 233 | _mm256_storeu_si256((__m256i*)y1, y100); y1 += 32; 234 | if (interleaved) 235 | { 236 | __m128i u00 = _mm256_extractf128_si256(uv02, 0); 237 | __m128i v00 = _mm256_extractf128_si256(uv02, 1); 238 | if (firstU) 239 | { 240 | __m256i uv00 = _mm256_setr_m128i(_mm_unpacklo_epi8(u00, v00), _mm_unpackhi_epi8(u00, v00)); 241 | _mm256_storeu_si256((__m256i*)u0, uv00); u0 += 32; 242 | } 243 | else 244 | { 245 | __m256i uv00 = _mm256_setr_m128i(_mm_unpacklo_epi8(v00, u00), _mm_unpackhi_epi8(v00, u00)); 246 | _mm256_storeu_si256((__m256i*)v0, uv00); v0 += 32; 247 | } 248 | } 249 | else 250 | { 251 | _mm256_storeu2_m128i((__m128i*)v0, (__m128i*)u0, uv02); u0 += 16; v0 += 16; 252 | } 253 | } 254 | if (componentRGB == 4) 255 | continue; 256 | #elif HAVE_SSE2 || HAVE_SSSE3 257 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 258 | for (int w = 0; w < halfWidth8; ++w) 259 | { 260 | __m128i rgb00[4] = { _mm_loadu_si128((__m128i*)rgb0), _mm_loadu_si128((__m128i*)rgb0 + 1), _mm_loadu_si128((__m128i*)rgb0 + 2), _mm_loadu_si128((__m128i*)rgb0 + 3) }; rgb0 += 16 * 4; 261 | __m128i rgb10[4] = { _mm_loadu_si128((__m128i*)rgb1), _mm_loadu_si128((__m128i*)rgb1 + 1), _mm_loadu_si128((__m128i*)rgb1 + 2), _mm_loadu_si128((__m128i*)rgb1 + 3) }; rgb1 += 16 * 4; 262 | 263 | #if HAVE_SSSE3 264 | __m128i yy = _mm_setr_epi8(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 265 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 266 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 267 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 268 | __m128i yy000 = _mm_maddubs_epi16(rgb00[0], yy); 269 | __m128i yy001 = _mm_maddubs_epi16(rgb00[1], yy); 270 | __m128i yy010 = _mm_maddubs_epi16(rgb00[2], yy); 271 | __m128i yy011 = _mm_maddubs_epi16(rgb00[3], yy); 272 | __m128i yy100 = _mm_maddubs_epi16(rgb10[0], yy); 273 | __m128i yy101 = _mm_maddubs_epi16(rgb10[1], yy); 274 | __m128i yy110 = _mm_maddubs_epi16(rgb10[2], yy); 275 | __m128i yy111 = _mm_maddubs_epi16(rgb10[3], yy); 276 | __m128i y00 = _mm_hadd_epi16(yy000, yy001); 277 | __m128i y01 = _mm_hadd_epi16(yy010, yy011); 278 | __m128i y10 = _mm_hadd_epi16(yy100, yy101); 279 | __m128i y11 = _mm_hadd_epi16(yy110, yy111); 280 | #else 281 | __m128i yy = _mm_setr_epi16(Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0, 282 | Y[0] >> 1, Y[1] >> 1, Y[2] >> 1, 0); 283 | __m128i yy000 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[0], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[0], __m128i()), yy)); 284 | __m128i yy001 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[1], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[1], __m128i()), yy)); 285 | __m128i yy010 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[2], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[2], __m128i()), yy)); 286 | __m128i yy011 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb00[3], 
__m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb00[3], __m128i()), yy)); 287 | __m128i yy100 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[0], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[0], __m128i()), yy)); 288 | __m128i yy101 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[1], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[1], __m128i()), yy)); 289 | __m128i yy110 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[2], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[2], __m128i()), yy)); 290 | __m128i yy111 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(rgb10[3], __m128i()), yy), _mm_madd_epi16(_mm_unpackhi_epi8(rgb10[3], __m128i()), yy)); 291 | __m128i y00 = _mm_packs_epi32(_mm_madd_epi16(yy000, _mm_set1_epi16(1)), _mm_madd_epi16(yy001, _mm_set1_epi16(1))); 292 | __m128i y01 = _mm_packs_epi32(_mm_madd_epi16(yy010, _mm_set1_epi16(1)), _mm_madd_epi16(yy011, _mm_set1_epi16(1))); 293 | __m128i y10 = _mm_packs_epi32(_mm_madd_epi16(yy100, _mm_set1_epi16(1)), _mm_madd_epi16(yy101, _mm_set1_epi16(1))); 294 | __m128i y11 = _mm_packs_epi32(_mm_madd_epi16(yy110, _mm_set1_epi16(1)), _mm_madd_epi16(yy111, _mm_set1_epi16(1))); 295 | #endif 296 | y00 = _mm_srli_epi16(y00, 7); 297 | y01 = _mm_srli_epi16(y01, 7); 298 | y10 = _mm_srli_epi16(y10, 7); 299 | y11 = _mm_srli_epi16(y11, 7); 300 | __m128i y000 = _mm_packus_epi16(y00, y01); 301 | __m128i y100 = _mm_packus_epi16(y10, y11); 302 | if (videoRange) 303 | { 304 | y000 = _mm_adds_epu8(y000, _mm_set1_epi8(16)); 305 | y100 = _mm_adds_epu8(y100, _mm_set1_epi8(16)); 306 | } 307 | 308 | __m128i uv00 = _mm_avg_epu8(rgb00[0], rgb10[0]); 309 | __m128i uv01 = _mm_avg_epu8(rgb00[1], rgb10[1]); 310 | __m128i uv10 = _mm_avg_epu8(rgb00[2], rgb10[2]); 311 | __m128i uv11 = _mm_avg_epu8(rgb00[3], rgb10[3]); 312 | __m128i uv0 = _mm_avg_epu8(_mm_shuffle_ps(uv00, uv01, _MM_SHUFFLE(2,0,2,0)), _mm_shuffle_ps(uv00, uv01, _MM_SHUFFLE(3,1,3,1))); 313 | __m128i uv1 = 
_mm_avg_epu8(_mm_shuffle_ps(uv10, uv11, _MM_SHUFFLE(2,0,2,0)), _mm_shuffle_ps(uv10, uv11, _MM_SHUFFLE(3,1,3,1))); 314 | #if HAVE_SSSE3 315 | __m128i uu = _mm_setr_epi8(U[0], U[1], U[2], 0, 316 | U[0], U[1], U[2], 0, 317 | U[0], U[1], U[2], 0, 318 | U[0], U[1], U[2], 0); 319 | __m128i vv = _mm_setr_epi8(V[0], V[1], V[2], 0, 320 | V[0], V[1], V[2], 0, 321 | V[0], V[1], V[2], 0, 322 | V[0], V[1], V[2], 0); 323 | __m128i uu00 = _mm_maddubs_epi16(uv0, uu); 324 | __m128i uu01 = _mm_maddubs_epi16(uv1, uu); 325 | __m128i vv00 = _mm_maddubs_epi16(uv0, vv); 326 | __m128i vv01 = _mm_maddubs_epi16(uv1, vv); 327 | __m128i u00 = _mm_hadd_epi16(uu00, uu01); 328 | __m128i v00 = _mm_hadd_epi16(vv00, vv01); 329 | #else 330 | __m128i uu = _mm_setr_epi16(U[0], U[1], U[2], 0, 331 | U[0], U[1], U[2], 0); 332 | __m128i vv = _mm_setr_epi16(V[0], V[1], V[2], 0, 333 | V[0], V[1], V[2], 0); 334 | __m128i uu00 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv0, __m128i()), uu), _mm_madd_epi16(_mm_unpackhi_epi8(uv0, __m128i()), uu)); 335 | __m128i uu01 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv1, __m128i()), uu), _mm_madd_epi16(_mm_unpackhi_epi8(uv1, __m128i()), uu)); 336 | __m128i vv00 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv0, __m128i()), vv), _mm_madd_epi16(_mm_unpackhi_epi8(uv0, __m128i()), vv)); 337 | __m128i vv01 = _mm_packs_epi32(_mm_madd_epi16(_mm_unpacklo_epi8(uv1, __m128i()), vv), _mm_madd_epi16(_mm_unpackhi_epi8(uv1, __m128i()), vv)); 338 | __m128i u00 = _mm_packs_epi32(_mm_madd_epi16(uu00, _mm_set1_epi16(1)), _mm_madd_epi16(uu01, _mm_set1_epi16(1))); 339 | __m128i v00 = _mm_packs_epi32(_mm_madd_epi16(vv00, _mm_set1_epi16(1)), _mm_madd_epi16(vv01, _mm_set1_epi16(1))); 340 | #endif 341 | u00 = _mm_srai_epi16(u00, 8); 342 | v00 = _mm_srai_epi16(v00, 8); 343 | __m128i uv02 = _mm_packs_epi16(u00, v00); 344 | uv02 = _mm_sub_epi8(uv02, _mm_set1_epi8(-128)); 345 | 346 | _mm_storeu_si128((__m128i*)y0, y000); y0 += 16; 347 | _mm_storeu_si128((__m128i*)y1, y100); 
y1 += 16; 348 | if (interleaved) 349 | { 350 | u00 = uv02; 351 | v00 = _mm_movehl_ps(uv02, uv02); 352 | if (firstU) 353 | { 354 | __m128i uv00 = _mm_unpacklo_epi8(u00, v00); 355 | _mm_storeu_si128((__m128i*)u0, uv00); u0 += 16; 356 | } 357 | else 358 | { 359 | __m128i uv00 = _mm_unpacklo_epi8(v00, u00); 360 | _mm_storeu_si128((__m128i*)v0, uv00); v0 += 16; 361 | } 362 | } 363 | else 364 | { 365 | _mm_storel_pi((__m64*)u0, uv02); u0 += 8; 366 | _mm_storeh_pi((__m64*)v0, uv02); v0 += 8; 367 | } 368 | } 369 | if (componentRGB == 4) 370 | continue; 371 | #endif 372 | for (int w = 0; w < halfWidth; ++w) 373 | { 374 | int b00 = (componentRGB >= 1) ? rgb0[0] : 255; 375 | int g00 = (componentRGB >= 2) ? rgb0[1] : 255; 376 | int r00 = (componentRGB >= 3) ? rgb0[2] : 255; 377 | int a00 = (componentRGB >= 4) ? rgb0[3] : 255; rgb0 += componentRGB; 378 | int b01 = (componentRGB >= 1) ? rgb0[0] : 255; 379 | int g01 = (componentRGB >= 2) ? rgb0[1] : 255; 380 | int r01 = (componentRGB >= 3) ? rgb0[2] : 255; 381 | int a01 = (componentRGB >= 4) ? rgb0[3] : 255; rgb0 += componentRGB; 382 | int b10 = (componentRGB >= 1) ? rgb1[0] : 255; 383 | int g10 = (componentRGB >= 2) ? rgb1[1] : 255; 384 | int r10 = (componentRGB >= 3) ? rgb1[2] : 255; 385 | int a10 = (componentRGB >= 4) ? rgb1[3] : 255; rgb1 += componentRGB; 386 | int b11 = (componentRGB >= 1) ? rgb1[0] : 255; 387 | int g11 = (componentRGB >= 2) ? rgb1[1] : 255; 388 | int r11 = (componentRGB >= 3) ? rgb1[2] : 255; 389 | int a11 = (componentRGB >= 4) ? 
rgb1[3] : 255; rgb1 += componentRGB; 390 | 391 | int r000 = (r00 + r01 + r10 + r11) / 4; 392 | int g000 = (g00 + g01 + g10 + g11) / 4; 393 | int b000 = (b00 + b01 + b10 + b11) / 4; 394 | 395 | int y00 = r00 * Y[0] + g00 * Y[1] + b00 * Y[2]; 396 | int y01 = r01 * Y[0] + g01 * Y[1] + b01 * Y[2]; 397 | int y10 = r10 * Y[0] + g10 * Y[1] + b10 * Y[2]; 398 | int y11 = r11 * Y[0] + g11 * Y[1] + b11 * Y[2]; 399 | int u00 = r000 * U[0] + g000 * U[1] + b000 * U[2]; 400 | int v00 = r000 * V[0] + g000 * V[1] + b000 * V[2]; 401 | 402 | auto clamp = [](int value) -> unsigned char 403 | { 404 | return (unsigned char)(value < 255 ? value < 0 ? 0 : value : 255); 405 | }; 406 | 407 | if (videoRange) 408 | { 409 | (*y0++) = clamp((y00 >> 8) + 16); 410 | (*y0++) = clamp((y01 >> 8) + 16); 411 | (*y1++) = clamp((y10 >> 8) + 16); 412 | (*y1++) = clamp((y11 >> 8) + 16); 413 | } 414 | else 415 | { 416 | (*y0++) = clamp(y00 >> 8); 417 | (*y0++) = clamp(y01 >> 8); 418 | (*y1++) = clamp(y10 >> 8); 419 | (*y1++) = clamp(y11 >> 8); 420 | } 421 | (*u0++) = clamp((u00 >> 8) + 128); 422 | (*v0++) = clamp((v00 >> 8) + 128); 423 | if (interleaved) 424 | { 425 | u0++; 426 | v0++; 427 | } 428 | } 429 | } 430 | } 431 | //------------------------------------------------------------------------------ 432 | #ifndef rgb2yuv_select 433 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 434 | rgb2yuv 435 | #endif 436 | //------------------------------------------------------------------------------ 437 | #ifndef rgb2yuv 438 | //------------------------------------------------------------------------------ 439 | #if defined(__llvm__) 440 | #define rgb2yuv_attribute(value) __attribute__((target(value))) 441 | #else 442 | #define rgb2yuv_attribute(value) 443 | #endif 444 | //------------------------------------------------------------------------------ 445 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 
446 | #define HAVE_NEON 1 447 | #define rgb2yuv rgb2yuv_attribute("neon") rgb2yuv_neon 448 | #include "rgb2yuv.inl" 449 | #undef rgb2yuv 450 | #undef HAVE_NEON 451 | #undef rgb2yuv_select 452 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 453 | neon() ? rgb2yuv_neon : \ 454 | rgb2yuv 455 | #endif 456 | //------------------------------------------------------------------------------ 457 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 458 | #define HAVE_SSE2 1 459 | #define rgb2yuv rgb2yuv_attribute("sse2") rgb2yuv_sse2 460 | #include "rgb2yuv.inl" 461 | #undef rgb2yuv 462 | #undef HAVE_SSE2 463 | #undef rgb2yuv_select 464 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 465 | sse2() ? rgb2yuv_sse2 : \ 466 | rgb2yuv 467 | #endif 468 | //------------------------------------------------------------------------------ 469 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 470 | #define HAVE_SSSE3 1 471 | #define rgb2yuv rgb2yuv_attribute("ssse3") rgb2yuv_ssse3 472 | #include "rgb2yuv.inl" 473 | #undef rgb2yuv 474 | #undef HAVE_SSSE3 475 | #undef rgb2yuv_select 476 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 477 | ssse3() ? rgb2yuv_ssse3 : \ 478 | sse2() ? rgb2yuv_sse2 : \ 479 | rgb2yuv 480 | #endif 481 | //------------------------------------------------------------------------------ 482 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 483 | #define HAVE_AVX2 1 484 | #define rgb2yuv rgb2yuv_attribute("avx2") rgb2yuv_avx2 485 | #include "rgb2yuv.inl" 486 | #undef rgb2yuv 487 | #undef HAVE_AVX2 488 | #undef rgb2yuv_select 489 | #define rgb2yuv_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 490 | avx2() ? rgb2yuv_avx2 : \ 491 | ssse3() ? rgb2yuv_ssse3 : \ 492 | sse2() ? 
rgb2yuv_sse2 : \ 493 | rgb2yuv 494 | #endif 495 | //------------------------------------------------------------------------------ 496 | #endif 497 | //------------------------------------------------------------------------------ 498 | -------------------------------------------------------------------------------- /yuv.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | #ifndef xxYUV_DEPRECATED 15 | #include "yuv2yuva.h" 16 | //------------------------------------------------------------------------------ 17 | inline void yuv_yu12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 18 | { 19 | yuv2yuva_parameter parameter = 20 | { 21 | .width = width, 22 | .height = height, 23 | .y = input, 24 | .alignWidth = alignWidth, 25 | .alignHeight = alignHeight, 26 | .alignSize = alignSize, 27 | .output = output, 28 | .strideOutput = strideOutput, 29 | .swizzleOutput = yuvSwizzle, 30 | }; 31 | yuv2yuva_yu12(¶meter); 32 | } 33 | //------------------------------------------------------------------------------ 34 | inline void yuv_yv12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 35 | { 36 | yuv2yuva_parameter parameter = 37 | { 38 | .width = width, 39 | .height = height, 40 | .y = input, 41 | .alignWidth = alignWidth, 42 | .alignHeight = 
alignHeight, 43 | .alignSize = alignSize, 44 | .output = output, 45 | .strideOutput = strideOutput, 46 | .swizzleOutput = yuvSwizzle, 47 | }; 48 | yuv2yuva_yv12(¶meter); 49 | } 50 | //------------------------------------------------------------------------------ 51 | inline void yuv_nv12_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 52 | { 53 | yuv2yuva_parameter parameter = 54 | { 55 | .width = width, 56 | .height = height, 57 | .y = input, 58 | .alignWidth = alignWidth, 59 | .alignHeight = alignHeight, 60 | .alignSize = alignSize, 61 | .output = output, 62 | .strideOutput = strideOutput, 63 | .swizzleOutput = yuvSwizzle, 64 | }; 65 | yuv2yuva_nv12(¶meter); 66 | } 67 | //------------------------------------------------------------------------------ 68 | inline void yuv_nv21_to_yuva(int width, int height, const void* input, void* output, bool yuvSwizzle = false, int strideOutput = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 69 | { 70 | yuv2yuva_parameter parameter = 71 | { 72 | .width = width, 73 | .height = height, 74 | .y = input, 75 | .alignWidth = alignWidth, 76 | .alignHeight = alignHeight, 77 | .alignSize = alignSize, 78 | .output = output, 79 | .strideOutput = strideOutput, 80 | .swizzleOutput = yuvSwizzle, 81 | }; 82 | yuv2yuva_nv21(¶meter); 83 | } 84 | //------------------------------------------------------------------------------ 85 | #endif 86 | //------------------------------------------------------------------------------ 87 | -------------------------------------------------------------------------------- /yuv2rgb.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | 
//============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "yuv2rgb.inl" 12 | #include "yuv2rgb.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void yuv2rgb_yu12(const yuv2rgb_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* y = parameter->y; 23 | const void* u = parameter->u; 24 | const void* v = parameter->v; 25 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 26 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 27 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 28 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 29 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 30 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 31 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 32 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 33 | bool videoRange = parameter->videoRange; 34 | 35 | void* rgb = parameter->rgb; 36 | int componentRGB = parameter->componentRGB; 37 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 38 | bool swizzleRGB = parameter->swizzleRGB; 39 | if (strideRGB < 0) 40 | { 41 | rgb = (char*)rgb - (strideRGB * (height - 1)); 42 | } 43 | 44 | u = u ? u : (char*)y + sizeY; 45 | v = v ? 
v : (char*)y + sizeY + sizeU; 46 | 47 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 48 | 49 | if (componentRGB == 3) 50 | { 51 | if (swizzleRGB) 52 | { 53 | if (videoRange) 54 | { 55 | static auto select = yuv2rgb_select(3, true, false, false, true); 56 | converter = select; 57 | } 58 | else 59 | { 60 | static auto select = yuv2rgb_select(3, true, false, false, false); 61 | converter = select; 62 | } 63 | } 64 | else 65 | { 66 | if (videoRange) 67 | { 68 | static auto select = yuv2rgb_select(3, false, false, false, true); 69 | converter = select; 70 | } 71 | else 72 | { 73 | static auto select = yuv2rgb_select(3, false, false, false, false); 74 | converter = select; 75 | } 76 | } 77 | } 78 | else if (componentRGB == 4) 79 | { 80 | if (swizzleRGB) 81 | { 82 | if (videoRange) 83 | { 84 | static auto select = yuv2rgb_select(4, true, false, false, true); 85 | converter = select; 86 | } 87 | else 88 | { 89 | static auto select = yuv2rgb_select(4, true, false, false, false); 90 | converter = select; 91 | } 92 | } 93 | else 94 | { 95 | if (videoRange) 96 | { 97 | static auto select = yuv2rgb_select(4, false, false, false, true); 98 | converter = select; 99 | } 100 | else 101 | { 102 | static auto select = yuv2rgb_select(4, false, false, false, false); 103 | converter = select; 104 | } 105 | } 106 | } 107 | else 108 | { 109 | return; 110 | } 111 | 112 | converter(width, height, y, u, v, strideY, strideU, strideV, rgb, strideRGB); // fix: third stride was strideU twice; strideV was computed but never used 113 | } 114 | //------------------------------------------------------------------------------ 115 | void yuv2rgb_yv12(const yuv2rgb_parameter* parameter) 116 | { 117 | int width = parameter->width; 118 | int height = parameter->height; 119 | 120 | const void* y = parameter->y; 121 | const void* u = parameter->u; 122 | const void* v = parameter->v; 123 | int alignWidth = parameter->alignWidth ?
parameter->alignWidth : 16; 124 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 125 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 126 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 127 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 128 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 129 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 130 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 131 | bool videoRange = parameter->videoRange; 132 | 133 | void* rgb = parameter->rgb; 134 | int componentRGB = parameter->componentRGB; 135 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 136 | bool swizzleRGB = parameter->swizzleRGB; 137 | if (strideRGB < 0) 138 | { 139 | rgb = (char*)rgb - (strideRGB * (height - 1)); 140 | } 141 | 142 | u = u ? u : (char*)y + sizeY + sizeU; 143 | v = v ? 
v : (char*)y + sizeY; 144 | 145 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 146 | 147 | if (componentRGB == 3) 148 | { 149 | if (swizzleRGB) 150 | { 151 | if (videoRange) 152 | { 153 | static auto select = yuv2rgb_select(3, true, false, false, true); 154 | converter = select; 155 | } 156 | else 157 | { 158 | static auto select = yuv2rgb_select(3, true, false, false, false); 159 | converter = select; 160 | } 161 | } 162 | else 163 | { 164 | if (videoRange) 165 | { 166 | static auto select = yuv2rgb_select(3, false, false, false, true); 167 | converter = select; 168 | } 169 | else 170 | { 171 | static auto select = yuv2rgb_select(3, false, false, false, false); 172 | converter = select; 173 | } 174 | } 175 | } 176 | else if (componentRGB == 4) 177 | { 178 | if (swizzleRGB) 179 | { 180 | if (videoRange) 181 | { 182 | static auto select = yuv2rgb_select(4, true, false, false, true); 183 | converter = select; 184 | } 185 | else 186 | { 187 | static auto select = yuv2rgb_select(4, true, false, false, false); 188 | converter = select; 189 | } 190 | } 191 | else 192 | { 193 | if (videoRange) 194 | { 195 | static auto select = yuv2rgb_select(4, false, false, false, true); 196 | converter = select; 197 | } 198 | else 199 | { 200 | static auto select = yuv2rgb_select(4, false, false, false, false); 201 | converter = select; 202 | } 203 | } 204 | } 205 | else 206 | { 207 | return; 208 | } 209 | 210 | converter(width, height, y, u, v, strideY, strideU, strideV, rgb, strideRGB); // fix: third stride was strideU twice; strideV was computed but never used 211 | } 212 | //------------------------------------------------------------------------------ 213 | void yuv2rgb_nv12(const yuv2rgb_parameter* parameter) 214 | { 215 | int width = parameter->width; 216 | int height = parameter->height; 217 | 218 | const void* y = parameter->y; 219 | const void* u = parameter->u; 220 | const void* v = parameter->v; 221 | int alignWidth = parameter->alignWidth ?
parameter->alignWidth : 16; 222 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 223 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 224 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 225 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 226 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 227 | bool videoRange = parameter->videoRange; 228 | 229 | void* rgb = parameter->rgb; 230 | int componentRGB = parameter->componentRGB; 231 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 232 | bool swizzleRGB = parameter->swizzleRGB; 233 | if (strideRGB < 0) 234 | { 235 | rgb = (char*)rgb - (strideRGB * (height - 1)); 236 | } 237 | 238 | u = u ? u : (char*)y + sizeY; 239 | v = v ? v : (char*)y + sizeY + 1; 240 | 241 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 242 | 243 | if (componentRGB == 3) 244 | { 245 | if (swizzleRGB) 246 | { 247 | if (videoRange) 248 | { 249 | static auto select = yuv2rgb_select(3, true, true, true, true); 250 | converter = select; 251 | } 252 | else 253 | { 254 | static auto select = yuv2rgb_select(3, true, true, true, false); 255 | converter = select; 256 | } 257 | } 258 | else 259 | { 260 | if (videoRange) 261 | { 262 | static auto select = yuv2rgb_select(3, false, true, true, true); 263 | converter = select; 264 | } 265 | else 266 | { 267 | static auto select = yuv2rgb_select(3, false, true, true, false); 268 | converter = select; 269 | } 270 | } 271 | } 272 | else if (componentRGB == 4) 273 | { 274 | if (swizzleRGB) 275 | { 276 | if (videoRange) 277 | { 278 | static auto select = yuv2rgb_select(4, true, true, true, true); 279 | converter = select; 280 | } 281 | else 282 | { 283 | static auto select = yuv2rgb_select(4, true, true, true, false); 284 | converter = select; 285 | } 286 
| } 287 | else 288 | { 289 | if (videoRange) 290 | { 291 | static auto select = yuv2rgb_select(4, false, true, true, true); 292 | converter = select; 293 | } 294 | else 295 | { 296 | static auto select = yuv2rgb_select(4, false, true, true, false); 297 | converter = select; 298 | } 299 | } 300 | } 301 | else 302 | { 303 | return; 304 | } 305 | 306 | converter(width, height, y, u, v, strideY, strideY, strideY, rgb, strideRGB); 307 | } 308 | //------------------------------------------------------------------------------ 309 | void yuv2rgb_nv21(const yuv2rgb_parameter* parameter) 310 | { 311 | int width = parameter->width; 312 | int height = parameter->height; 313 | 314 | const void* y = parameter->y; 315 | const void* u = parameter->u; 316 | const void* v = parameter->v; 317 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 318 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 319 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 320 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 321 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 322 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 323 | bool videoRange = parameter->videoRange; 324 | 325 | void* rgb = parameter->rgb; 326 | int componentRGB = parameter->componentRGB; 327 | int strideRGB = parameter->strideRGB ? parameter->strideRGB : componentRGB * width; 328 | bool swizzleRGB = parameter->swizzleRGB; 329 | if (strideRGB < 0) 330 | { 331 | rgb = (char*)rgb - (strideRGB * (height - 1)); 332 | } 333 | 334 | u = u ? u : (char*)y + sizeY + 1; 335 | v = v ? 
v : (char*)y + sizeY; 336 | 337 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 338 | 339 | if (componentRGB == 3) 340 | { 341 | if (swizzleRGB) 342 | { 343 | if (videoRange) 344 | { 345 | static auto select = yuv2rgb_select(3, true, true, false, true); 346 | converter = select; 347 | } 348 | else 349 | { 350 | static auto select = yuv2rgb_select(3, true, true, false, false); 351 | converter = select; 352 | } 353 | } 354 | else 355 | { 356 | if (videoRange) 357 | { 358 | static auto select = yuv2rgb_select(3, false, true, false, true); 359 | converter = select; 360 | } 361 | else 362 | { 363 | static auto select = yuv2rgb_select(3, false, true, false, false); 364 | converter = select; 365 | } 366 | } 367 | } 368 | else if (componentRGB == 4) 369 | { 370 | if (swizzleRGB) 371 | { 372 | if (videoRange) 373 | { 374 | static auto select = yuv2rgb_select(4, true, true, false, true); 375 | converter = select; 376 | } 377 | else 378 | { 379 | static auto select = yuv2rgb_select(4, true, true, false, false); 380 | converter = select; 381 | } 382 | } 383 | else 384 | { 385 | if (videoRange) 386 | { 387 | static auto select = yuv2rgb_select(4, false, true, false, true); 388 | converter = select; 389 | } 390 | else 391 | { 392 | static auto select = yuv2rgb_select(4, false, true, false, false); 393 | converter = select; 394 | } 395 | } 396 | } 397 | else 398 | { 399 | return; 400 | } 401 | 402 | converter(width, height, y, u, v, strideY, strideY, strideY, rgb, strideRGB); 403 | } 404 | //------------------------------------------------------------------------------ 405 | -------------------------------------------------------------------------------- /yuv2rgb.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Header 3 | // 4 | // 
Copyright (c) 2020-2021 TAiGA
// https://github.com/metarutaiga/xxYUV
//==============================================================================
#pragma once

#ifndef xxYUV_EXPORT
#define xxYUV_EXPORT
#endif

//------------------------------------------------------------------------------
// Describes one YUV -> RGB conversion: input planes, layout/alignment, and
// the output buffer. Zeroed fields take the documented defaults.
typedef struct _yuv2rgb_parameter
{
    int width;              // image width in pixels
    int height;             // image height in pixels

    const void* y;          // Y plane
    const void* u;          // U plane (or interleaved chroma); may be NULL —
                            // then derived from the end of the Y plane
    const void* v;          // V plane (or interleaved chroma); may be NULL
    int alignWidth;         // Y-stride alignment used when strideY == 0 (default 16)
    int alignHeight;        // height alignment used when sizing planes (default 1)
    int alignSize;          // plane-size alignment in bytes (default 1)
    int strideY;            // bytes per Y row; 0 = align(width, alignWidth)
    int strideU;            // bytes per U row — NOTE(review): the converters in
                            // yuv2rgb.cpp currently pass strideY instead; verify
    int strideV;            // bytes per V row — NOTE(review): currently ignored, see above
    bool videoRange;        // true = BT.709 video-range coefficients, false = full range

    void* rgb;              // output pixel buffer
    int componentRGB;       // bytes per output pixel: 3 = RGB, 4 = RGBA (others rejected)
    int strideRGB;          // bytes per output row; 0 = componentRGB * width;
                            // negative writes the image bottom-up
    bool swizzleRGB;        // true = swap R and B (BGR/BGRA output)
} yuv2rgb_parameter;
//------------------------------------------------------------------------------
xxYUV_EXPORT void yuv2rgb_yu12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_yv12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_nv12(const yuv2rgb_parameter* parameter);
xxYUV_EXPORT void yuv2rgb_nv21(const yuv2rgb_parameter* parameter);
//------------------------------------------------------------------------------
#ifndef xxYUV_DEPRECATED
//------------------------------------------------------------------------------
// Deprecated convenience wrapper: builds a yuv2rgb_parameter from loose
// arguments and forwards to yuv2rgb_yu12. Note fullRange is inverted into
// videoRange.
inline void yuv2rgb_yu12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1)
{
    yuv2rgb_parameter parameter =
    {
        .width = width,
        .height = height,
        .y = yuv,
        .alignWidth = alignWidth,
        .alignHeight = alignHeight,
        .alignSize = alignSize,
        .videoRange = !fullRange,
        .rgb = rgb,
        .componentRGB = rgbWidth,
        .strideRGB = strideRGB,
        .swizzleRGB = rgbSwizzle,
    };
    yuv2rgb_yu12(&parameter);
}
//------------------------------------------------------------------------------ 62 | inline void yuv2rgb_yv12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 63 | { 64 | yuv2rgb_parameter parameter = 65 | { 66 | .width = width, 67 | .height = height, 68 | .y = yuv, 69 | .alignWidth = alignWidth, 70 | .alignHeight = alignHeight, 71 | .alignSize = alignSize, 72 | .videoRange = !fullRange, 73 | .rgb = rgb, 74 | .componentRGB = rgbWidth, 75 | .strideRGB = strideRGB, 76 | .swizzleRGB = rgbSwizzle, 77 | }; 78 | yuv2rgb_yv12(¶meter); 79 | } 80 | //------------------------------------------------------------------------------ 81 | inline void yuv2rgb_nv12(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 82 | { 83 | yuv2rgb_parameter parameter = 84 | { 85 | .width = width, 86 | .height = height, 87 | .y = yuv, 88 | .alignWidth = alignWidth, 89 | .alignHeight = alignHeight, 90 | .alignSize = alignSize, 91 | .videoRange = !fullRange, 92 | .rgb = rgb, 93 | .componentRGB = rgbWidth, 94 | .strideRGB = strideRGB, 95 | .swizzleRGB = rgbSwizzle, 96 | }; 97 | yuv2rgb_nv12(¶meter); 98 | } 99 | //------------------------------------------------------------------------------ 100 | inline void yuv2rgb_nv21(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1) 101 | { 102 | yuv2rgb_parameter parameter = 103 | { 104 | .width = width, 105 | .height = height, 106 | .y = yuv, 107 | .alignWidth = alignWidth, 108 | .alignHeight = alignHeight, 109 | .alignSize = alignSize, 110 | .videoRange = !fullRange, 111 | .rgb = rgb, 112 | .componentRGB = rgbWidth, 113 | 
.strideRGB = strideRGB, 114 | .swizzleRGB = rgbSwizzle, 115 | }; 116 | yuv2rgb_nv21(¶meter); 117 | } 118 | //------------------------------------------------------------------------------ 119 | #endif 120 | //------------------------------------------------------------------------------ 121 | -------------------------------------------------------------------------------- /yuv2rgb.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | // BT.709 - Video Range 8 | // Y U V 9 | // R = 1.164384 0.000000 1.792741 10 | // G = 1.164384 -0.213249 -0.532909 11 | // B = 1.164384 2.112402 0.000000 12 | // 13 | // BT.709 - Full Range 14 | // Y U V 15 | // R = 1.000000 0.000000 1.581000 16 | // G = 1.000000 -0.188062 -0.469967 17 | // B = 1.000000 1.862906 0.000000 18 | #define vY 1.164384 19 | #define vUG -0.213249 20 | #define vUB 2.112402 21 | #define vVR 1.792741 22 | #define vVG -0.532909 23 | #define fY 1.000000 24 | #define fUG -0.188062 25 | #define fUB 1.862906 26 | #define fVR 1.581000 27 | #define fVG -0.469967 28 | 29 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 30 | # include 31 | # define NEON_FAST 1 32 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 33 | # include 34 | # if defined(__llvm__) 35 | # include 36 | # include 37 | # endif 38 | # define _MM_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 39 | __m128i T0, T1, T2, T3; \ 40 | T0 = _mm_unpacklo_epi8(R0, R1); \ 41 | T1 = _mm_unpacklo_epi8(R2, R3); \ 42 | T2 = _mm_unpackhi_epi8(R0, R1); \ 43 | T3 = _mm_unpackhi_epi8(R2, R3); \ 44 | R0 = _mm_unpacklo_epi16(T0, T1); \ 45 | R1 = 
_mm_unpackhi_epi16(T0, T1); \ 46 | R2 = _mm_unpacklo_epi16(T2, T3); \ 47 | R3 = _mm_unpackhi_epi16(T2, T3); \ 48 | } 49 | # define _MM256_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 50 | __m256i T0, T1, T2, T3; \ 51 | T0 = _mm256_unpacklo_epi8(R0, R1); \ 52 | T1 = _mm256_unpacklo_epi8(R2, R3); \ 53 | T2 = _mm256_unpackhi_epi8(R0, R1); \ 54 | T3 = _mm256_unpackhi_epi8(R2, R3); \ 55 | R0 = _mm256_unpacklo_epi16(T0, T1); \ 56 | R1 = _mm256_unpackhi_epi16(T0, T1); \ 57 | R2 = _mm256_unpacklo_epi16(T2, T3); \ 58 | R3 = _mm256_unpackhi_epi16(T2, T3); \ 59 | } 60 | # define _MM256_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 61 | __m256i T0, T1, T2, T3; \ 62 | T0 = _mm256_permute2x128_si256(R0, R1, 32); \ 63 | T1 = _mm256_permute2x128_si256(R0, R1, 49); \ 64 | T2 = _mm256_permute2x128_si256(R2, R3, 32); \ 65 | T3 = _mm256_permute2x128_si256(R2, R3, 49); \ 66 | R0 = T0; \ 67 | R2 = T1; \ 68 | R1 = T2; \ 69 | R3 = T3; \ 70 | } 71 | # define _MM512_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 72 | __m512i T0, T1, T2, T3; \ 73 | T0 = _mm512_unpacklo_epi8(R0, R1); \ 74 | T1 = _mm512_unpacklo_epi8(R2, R3); \ 75 | T2 = _mm512_unpackhi_epi8(R0, R1); \ 76 | T3 = _mm512_unpackhi_epi8(R2, R3); \ 77 | R0 = _mm512_unpacklo_epi16(T0, T1); \ 78 | R1 = _mm512_unpackhi_epi16(T0, T1); \ 79 | R2 = _mm512_unpacklo_epi16(T2, T3); \ 80 | R3 = _mm512_unpackhi_epi16(T2, T3); \ 81 | } 82 | # define _MM512_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 83 | __m512i T0, T1, T2, T3; \ 84 | T0 = _mm512_shuffle_i32x4(R0, R1, 0x44); \ 85 | T1 = _mm512_shuffle_i32x4(R2, R3, 0x44); \ 86 | T2 = _mm512_shuffle_i32x4(R0, R1, 0xEE); \ 87 | T3 = _mm512_shuffle_i32x4(R2, R3, 0xEE); \ 88 | R0 = _mm512_shuffle_i32x4(T0, T1, 0x88); \ 89 | R1 = _mm512_shuffle_i32x4(T0, T1, 0xDD); \ 90 | R2 = _mm512_shuffle_i32x4(T2, T3, 0x88); \ 91 | R3 = _mm512_shuffle_i32x4(T2, T3, 0xDD); \ 92 | } 93 | #endif 94 | 95 | //------------------------------------------------------------------------------ 96 | template 97 | void yuv2rgb(int width, int height, const void* 
y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB) 98 | { 99 | int halfWidth = width >> 1; 100 | int halfHeight = height >> 1; 101 | 102 | int iR = swizzleRGB ? 2 : 0; 103 | int iG = 1; 104 | int iB = swizzleRGB ? 0 : 2; 105 | int iA = 3; 106 | 107 | int Y, UG, UB, VR, VG; 108 | if (videoRange) 109 | { 110 | Y = (int)(vY * 256); 111 | UG = (int)(vUG * 255); UB = (int)(vUB * 255); 112 | VR = (int)(vVR * 255); VG = (int)(vVG * 255); 113 | } 114 | else 115 | { 116 | Y = (int)(fY * 256); 117 | UG = (int)(fUG * 255); UB = (int)(fUB * 255); 118 | VR = (int)(fVR * 255); VG = (int)(fVG * 255); 119 | } 120 | 121 | for (int h = 0; h < halfHeight; ++h) 122 | { 123 | const unsigned char* y0 = (unsigned char*)y; 124 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 125 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 126 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 127 | unsigned char* rgb0 = (unsigned char*)rgb; 128 | unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 129 | #if HAVE_NEON 130 | int halfWidth8 = (componentRGB == 4) ? 
halfWidth / 8 : 0; 131 | for (int w = 0; w < halfWidth8; ++w) 132 | { 133 | uint8x16_t y00lh = vld1q_u8(y0); y0 += 16; 134 | uint8x16_t y10lh = vld1q_u8(y1); y1 += 16; 135 | uint8x8_t y00; 136 | uint8x8_t y01; 137 | uint8x8_t y10; 138 | uint8x8_t y11; 139 | if (videoRange) 140 | { 141 | y00lh = vqsubq_u8(y00lh, vdupq_n_u8(16)); 142 | y10lh = vqsubq_u8(y10lh, vdupq_n_u8(16)); 143 | y00 = vshrn_n_u16(vmull_u8(vget_low_u8(y00lh), vdup_n_u8(Y >> 1)), 7); 144 | y01 = vshrn_n_u16(vmull_u8(vget_high_u8(y00lh), vdup_n_u8(Y >> 1)), 7); 145 | y10 = vshrn_n_u16(vmull_u8(vget_low_u8(y10lh), vdup_n_u8(Y >> 1)), 7); 146 | y11 = vshrn_n_u16(vmull_u8(vget_high_u8(y10lh), vdup_n_u8(Y >> 1)), 7); 147 | } 148 | else 149 | { 150 | y00 = vget_low_u8(y00lh); 151 | y01 = vget_high_u8(y00lh); 152 | y10 = vget_low_u8(y10lh); 153 | y11 = vget_high_u8(y10lh); 154 | } 155 | 156 | int8x8_t u000; 157 | int8x8_t v000; 158 | if (interleaved) 159 | { 160 | if (firstU) 161 | { 162 | int8x16_t uv00 = vld1q_u8(u0); u0 += 16; 163 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 164 | int8x16_t uv000 = vaddq_s8(vcombine_s8(uv00lh.val[0], uv00lh.val[1]), vdupq_n_s8(-128)); 165 | u000 = vget_low_s8(uv000); 166 | v000 = vget_high_s8(uv000); 167 | } 168 | else 169 | { 170 | int8x16_t uv00 = vld1q_u8(v0); v0 += 16; 171 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 172 | int8x16_t uv000 = vaddq_s8(vcombine_s8(uv00lh.val[1], uv00lh.val[0]), vdupq_n_s8(-128)); 173 | u000 = vget_low_s8(uv000); 174 | v000 = vget_high_s8(uv000); 175 | } 176 | } 177 | else 178 | { 179 | int8x16_t uv000 = vaddq_s8(vcombine_s8(vld1_u8(u0), vld1_u8(v0)), vdupq_n_s8(-128)); u0 += 8; v0 += 8; 180 | u000 = vget_low_s8(uv000); 181 | v000 = vget_high_s8(uv000); 182 | } 183 | 184 | #if NEON_FAST 185 | int16x8_t dR = vshrq_n_s16( vmull_s8(v000, vdup_n_s8(VR >> 2)), 6); 186 | int16x8_t dG = vshrq_n_s16(vmlal_s8(vmull_s8(u000, vdup_n_s8(UG >> 1)), v000, vdup_n_s8(VG >> 1)), 7); 187 | int16x8_t dB = 
vshrq_n_s16( vmull_s8(u000, vdup_n_s8(UB >> 3)), 5); 188 | #else 189 | int16x8_t u00 = vshll_n_s8(u000, 7); 190 | int16x8_t v00 = vshll_n_s8(v000, 7); 191 | 192 | int16x8_t dR = vqdmulhq_s16(v00, vdupq_n_s16(VR)); 193 | int16x8_t dG = vaddq_s16(vqdmulhq_s16(u00, vdupq_n_s16(UG)), vqdmulhq_s16(v00, vdupq_n_s16(VG))); 194 | int16x8_t dB = vqdmulhq_s16(u00, vdupq_n_s16(UB)); 195 | #endif 196 | 197 | uint16x8x2_t xR = vzipq_u16(vreinterpretq_u16_s16(dR), vreinterpretq_u16_s16(dR)); 198 | uint16x8x2_t xG = vzipq_u16(vreinterpretq_u16_s16(dG), vreinterpretq_u16_s16(dG)); 199 | uint16x8x2_t xB = vzipq_u16(vreinterpretq_u16_s16(dB), vreinterpretq_u16_s16(dB)); 200 | 201 | uint8x16x4_t t; 202 | uint8x16x4_t b; 203 | 204 | t.val[iR] = vcombine_u8(vqmovun_s16(vaddw_u8(xR.val[0], y00)), vqmovun_s16(vaddw_u8(xR.val[1], y01))); 205 | t.val[iG] = vcombine_u8(vqmovun_s16(vaddw_u8(xG.val[0], y00)), vqmovun_s16(vaddw_u8(xG.val[1], y01))); 206 | t.val[iB] = vcombine_u8(vqmovun_s16(vaddw_u8(xB.val[0], y00)), vqmovun_s16(vaddw_u8(xB.val[1], y01))); 207 | t.val[iA] = vdupq_n_u8(255); 208 | b.val[iR] = vcombine_u8(vqmovun_s16(vaddw_u8(xR.val[0], y10)), vqmovun_s16(vaddw_u8(xR.val[1], y11))); 209 | b.val[iG] = vcombine_u8(vqmovun_s16(vaddw_u8(xG.val[0], y10)), vqmovun_s16(vaddw_u8(xG.val[1], y11))); 210 | b.val[iB] = vcombine_u8(vqmovun_s16(vaddw_u8(xB.val[0], y10)), vqmovun_s16(vaddw_u8(xB.val[1], y11))); 211 | b.val[iA] = vdupq_n_u8(255); 212 | 213 | vst4q_u8(rgb0, t); rgb0 += 16 * 4; 214 | vst4q_u8(rgb1, b); rgb1 += 16 * 4; 215 | } 216 | if (componentRGB == 4) 217 | continue; 218 | #elif HAVE_AVX512 219 | int halfWidth16 = (componentRGB == 4) ? 
halfWidth / 32 : 0; 220 | for (int w = 0; w < halfWidth16; ++w) 221 | { 222 | __m512i y00lh = _mm512_loadu_si512((__m512i*)y0); y0 += 64; 223 | __m512i y10lh = _mm512_loadu_si512((__m512i*)y1); y1 += 64; 224 | __m512i y00; 225 | __m512i y01; 226 | __m512i y10; 227 | __m512i y11; 228 | if (videoRange) 229 | { 230 | y00lh = _mm512_subs_epu8(y00lh, _mm512_set1_epi8(16)); 231 | y10lh = _mm512_subs_epu8(y10lh, _mm512_set1_epi8(16)); 232 | y00 = _mm512_mulhi_epu16(_mm512_unpacklo_epi8(__m512i(), y00lh), _mm512_set1_epi16(Y)); 233 | y01 = _mm512_mulhi_epu16(_mm512_unpackhi_epi8(__m512i(), y00lh), _mm512_set1_epi16(Y)); 234 | y10 = _mm512_mulhi_epu16(_mm512_unpacklo_epi8(__m512i(), y10lh), _mm512_set1_epi16(Y)); 235 | y11 = _mm512_mulhi_epu16(_mm512_unpackhi_epi8(__m512i(), y10lh), _mm512_set1_epi16(Y)); 236 | } 237 | else 238 | { 239 | y00 = _mm512_unpacklo_epi8(y00lh, __m512i()); 240 | y01 = _mm512_unpackhi_epi8(y00lh, __m512i()); 241 | y10 = _mm512_unpacklo_epi8(y10lh, __m512i()); 242 | y11 = _mm512_unpackhi_epi8(y10lh, __m512i()); 243 | } 244 | 245 | __m512i u00; 246 | __m512i v00; 247 | if (interleaved) 248 | { 249 | if (firstU) 250 | { 251 | __m512i uv00 = _mm512_loadu_si512((__m512i*)u0); u0 += 64; 252 | uv00 = _mm512_sub_epi8(uv00, _mm512_set1_epi8(-128)); 253 | u00 = _mm512_slli_epi16(uv00, 8); 254 | v00 = uv00; 255 | } 256 | else 257 | { 258 | __m512i uv00 = _mm512_loadu_si512((__m512i*)v0); v0 += 64; 259 | uv00 = _mm512_sub_epi8(uv00, _mm512_set1_epi8(-128)); 260 | u00 = uv00; 261 | v00 = _mm512_slli_epi16(uv00, 8); 262 | } 263 | } 264 | else 265 | { 266 | __m256i u000 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 267 | __m256i v000 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 268 | u000 = _mm256_sub_epi8(u000, _mm256_set1_epi8(-128)); 269 | v000 = _mm256_sub_epi8(v000, _mm256_set1_epi8(-128)); 270 | u00 = _mm512_slli_epi16(_mm512_cvtepi8_epi16(u000), 8); 271 | v00 = _mm512_slli_epi16(_mm512_cvtepi8_epi16(v000), 8); 272 | } 273 | 274 | __m512i dR = 
_mm512_mulhi_epi16(v00, _mm512_set1_epi16(VR)); 275 | __m512i dG = _mm512_add_epi16(_mm512_mulhi_epi16(u00, _mm512_set1_epi16(UG)), _mm512_mulhi_epi16(v00, _mm512_set1_epi16(VG))); 276 | __m512i dB = _mm512_mulhi_epi16(u00, _mm512_set1_epi16(UB)); 277 | 278 | __m512i xR[2] = { _mm512_unpacklo_epi16(dR, dR), _mm512_unpackhi_epi16(dR, dR) }; 279 | __m512i xG[2] = { _mm512_unpacklo_epi16(dG, dG), _mm512_unpackhi_epi16(dG, dG) }; 280 | __m512i xB[2] = { _mm512_unpacklo_epi16(dB, dB), _mm512_unpackhi_epi16(dB, dB) }; 281 | 282 | __m512i t[4]; 283 | __m512i b[4]; 284 | 285 | t[iR] = _mm512_packus_epi16(_mm512_add_epi16(y00, xR[0]), _mm512_add_epi16(y01, xR[1])); 286 | t[iG] = _mm512_packus_epi16(_mm512_add_epi16(y00, xG[0]), _mm512_add_epi16(y01, xG[1])); 287 | t[iB] = _mm512_packus_epi16(_mm512_add_epi16(y00, xB[0]), _mm512_add_epi16(y01, xB[1])); 288 | t[iA] = _mm512_set1_epi8(-1); 289 | b[iR] = _mm512_packus_epi16(_mm512_add_epi16(y10, xR[0]), _mm512_add_epi16(y11, xR[1])); 290 | b[iG] = _mm512_packus_epi16(_mm512_add_epi16(y10, xG[0]), _mm512_add_epi16(y11, xG[1])); 291 | b[iB] = _mm512_packus_epi16(_mm512_add_epi16(y10, xB[0]), _mm512_add_epi16(y11, xB[1])); 292 | b[iA] = _mm512_set1_epi8(-1); 293 | 294 | _MM512_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 295 | _MM512_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 296 | _MM512_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 297 | _MM512_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 298 | 299 | _mm512_storeu_si512((__m512i*)rgb0 + 0, t[0]); 300 | _mm512_storeu_si512((__m512i*)rgb0 + 1, t[1]); 301 | _mm512_storeu_si512((__m512i*)rgb0 + 2, t[2]); 302 | _mm512_storeu_si512((__m512i*)rgb0 + 3, t[3]); rgb0 += 16 * 16; 303 | _mm512_storeu_si512((__m512i*)rgb1 + 0, b[0]); 304 | _mm512_storeu_si512((__m512i*)rgb1 + 1, b[1]); 305 | _mm512_storeu_si512((__m512i*)rgb1 + 2, b[2]); 306 | _mm512_storeu_si512((__m512i*)rgb1 + 3, b[3]); rgb1 += 16 * 16; 307 | } 308 | if (componentRGB == 4) 309 | continue; 310 | #elif HAVE_AVX2 311 | int halfWidth16 = 
(componentRGB == 4) ? halfWidth / 16 : 0; 312 | for (int w = 0; w < halfWidth16; ++w) 313 | { 314 | __m256i y00lh = _mm256_loadu_si256((__m256i*)y0); y0 += 32; 315 | __m256i y10lh = _mm256_loadu_si256((__m256i*)y1); y1 += 32; 316 | __m256i y00; 317 | __m256i y01; 318 | __m256i y10; 319 | __m256i y11; 320 | if (videoRange) 321 | { 322 | y00lh = _mm256_subs_epu8(y00lh, _mm256_set1_epi8(16)); 323 | y10lh = _mm256_subs_epu8(y10lh, _mm256_set1_epi8(16)); 324 | y00 = _mm256_mulhi_epu16(_mm256_unpacklo_epi8(__m256i(), y00lh), _mm256_set1_epi16(Y)); 325 | y01 = _mm256_mulhi_epu16(_mm256_unpackhi_epi8(__m256i(), y00lh), _mm256_set1_epi16(Y)); 326 | y10 = _mm256_mulhi_epu16(_mm256_unpacklo_epi8(__m256i(), y10lh), _mm256_set1_epi16(Y)); 327 | y11 = _mm256_mulhi_epu16(_mm256_unpackhi_epi8(__m256i(), y10lh), _mm256_set1_epi16(Y)); 328 | } 329 | else 330 | { 331 | y00 = _mm256_unpacklo_epi8(y00lh, __m256i()); 332 | y01 = _mm256_unpackhi_epi8(y00lh, __m256i()); 333 | y10 = _mm256_unpacklo_epi8(y10lh, __m256i()); 334 | y11 = _mm256_unpackhi_epi8(y10lh, __m256i()); 335 | } 336 | 337 | __m256i u00; 338 | __m256i v00; 339 | if (interleaved) 340 | { 341 | if (firstU) 342 | { 343 | __m256i uv00 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 344 | uv00 = _mm256_sub_epi8(uv00, _mm256_set1_epi8(-128)); 345 | u00 = _mm256_slli_epi16(uv00, 8); 346 | v00 = uv00; 347 | } 348 | else 349 | { 350 | __m256i uv00 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 351 | uv00 = _mm256_sub_epi8(uv00, _mm256_set1_epi8(-128)); 352 | u00 = uv00; 353 | v00 = _mm256_slli_epi16(uv00, 8); 354 | } 355 | } 356 | else 357 | { 358 | __m128i u000 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 359 | __m128i v000 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 360 | u000 = _mm_sub_epi8(u000, _mm_set1_epi8(-128)); 361 | v000 = _mm_sub_epi8(v000, _mm_set1_epi8(-128)); 362 | u00 = _mm256_slli_epi16(_mm256_cvtepi8_epi16(u000), 8); 363 | v00 = _mm256_slli_epi16(_mm256_cvtepi8_epi16(v000), 8); 364 | } 365 | 366 | __m256i dR = 
_mm256_mulhi_epi16(v00, _mm256_set1_epi16(VR)); 367 | __m256i dG = _mm256_add_epi16(_mm256_mulhi_epi16(u00, _mm256_set1_epi16(UG)), _mm256_mulhi_epi16(v00, _mm256_set1_epi16(VG))); 368 | __m256i dB = _mm256_mulhi_epi16(u00, _mm256_set1_epi16(UB)); 369 | 370 | __m256i xR[2] = { _mm256_unpacklo_epi16(dR, dR), _mm256_unpackhi_epi16(dR, dR) }; 371 | __m256i xG[2] = { _mm256_unpacklo_epi16(dG, dG), _mm256_unpackhi_epi16(dG, dG) }; 372 | __m256i xB[2] = { _mm256_unpacklo_epi16(dB, dB), _mm256_unpackhi_epi16(dB, dB) }; 373 | 374 | __m256i t[4]; 375 | __m256i b[4]; 376 | 377 | t[iR] = _mm256_packus_epi16(_mm256_add_epi16(y00, xR[0]), _mm256_add_epi16(y01, xR[1])); 378 | t[iG] = _mm256_packus_epi16(_mm256_add_epi16(y00, xG[0]), _mm256_add_epi16(y01, xG[1])); 379 | t[iB] = _mm256_packus_epi16(_mm256_add_epi16(y00, xB[0]), _mm256_add_epi16(y01, xB[1])); 380 | t[iA] = _mm256_set1_epi8(-1); 381 | b[iR] = _mm256_packus_epi16(_mm256_add_epi16(y10, xR[0]), _mm256_add_epi16(y11, xR[1])); 382 | b[iG] = _mm256_packus_epi16(_mm256_add_epi16(y10, xG[0]), _mm256_add_epi16(y11, xG[1])); 383 | b[iB] = _mm256_packus_epi16(_mm256_add_epi16(y10, xB[0]), _mm256_add_epi16(y11, xB[1])); 384 | b[iA] = _mm256_set1_epi8(-1); 385 | 386 | _MM256_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 387 | _MM256_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 388 | _MM256_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 389 | _MM256_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 390 | 391 | _mm256_storeu_si256((__m256i*)rgb0 + 0, t[0]); 392 | _mm256_storeu_si256((__m256i*)rgb0 + 1, t[1]); 393 | _mm256_storeu_si256((__m256i*)rgb0 + 2, t[2]); 394 | _mm256_storeu_si256((__m256i*)rgb0 + 3, t[3]); rgb0 += 16 * 8; 395 | _mm256_storeu_si256((__m256i*)rgb1 + 0, b[0]); 396 | _mm256_storeu_si256((__m256i*)rgb1 + 1, b[1]); 397 | _mm256_storeu_si256((__m256i*)rgb1 + 2, b[2]); 398 | _mm256_storeu_si256((__m256i*)rgb1 + 3, b[3]); rgb1 += 16 * 8; 399 | } 400 | if (componentRGB == 4) 401 | continue; 402 | #elif HAVE_SSE2 403 | int halfWidth8 = 
(componentRGB == 4) ? halfWidth / 8 : 0; 404 | for (int w = 0; w < halfWidth8; ++w) 405 | { 406 | __m128i y00lh = _mm_loadu_si128((__m128i*)y0); y0 += 16; 407 | __m128i y10lh = _mm_loadu_si128((__m128i*)y1); y1 += 16; 408 | __m128i y00; 409 | __m128i y01; 410 | __m128i y10; 411 | __m128i y11; 412 | if (videoRange) 413 | { 414 | y00lh = _mm_subs_epu8(y00lh, _mm_set1_epi8(16)); 415 | y10lh = _mm_subs_epu8(y10lh, _mm_set1_epi8(16)); 416 | y00 = _mm_mulhi_epu16(_mm_unpacklo_epi8(__m128i(), y00lh), _mm_set1_epi16(Y)); 417 | y01 = _mm_mulhi_epu16(_mm_unpackhi_epi8(__m128i(), y00lh), _mm_set1_epi16(Y)); 418 | y10 = _mm_mulhi_epu16(_mm_unpacklo_epi8(__m128i(), y10lh), _mm_set1_epi16(Y)); 419 | y11 = _mm_mulhi_epu16(_mm_unpackhi_epi8(__m128i(), y10lh), _mm_set1_epi16(Y)); 420 | } 421 | else 422 | { 423 | y00 = _mm_unpacklo_epi8(y00lh, __m128i()); 424 | y01 = _mm_unpackhi_epi8(y00lh, __m128i()); 425 | y10 = _mm_unpacklo_epi8(y10lh, __m128i()); 426 | y11 = _mm_unpackhi_epi8(y10lh, __m128i()); 427 | } 428 | 429 | __m128i u00; 430 | __m128i v00; 431 | if (interleaved) 432 | { 433 | if (firstU) 434 | { 435 | __m128i uv00 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 436 | uv00 = _mm_sub_epi8(uv00, _mm_set1_epi8(-128)); 437 | u00 = _mm_slli_epi16(uv00, 8); 438 | v00 = uv00; 439 | } 440 | else 441 | { 442 | __m128i uv00 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 443 | uv00 = _mm_sub_epi8(uv00, _mm_set1_epi8(-128)); 444 | u00 = uv00; 445 | v00 = _mm_slli_epi16(uv00, 8); 446 | } 447 | } 448 | else 449 | { 450 | __m128i u000 = _mm_loadl_epi64((__m128i*)u0); u0 += 8; 451 | __m128i v000 = _mm_loadl_epi64((__m128i*)v0); v0 += 8; 452 | u000 = _mm_sub_epi8(u000, _mm_set1_epi8(-128)); 453 | v000 = _mm_sub_epi8(v000, _mm_set1_epi8(-128)); 454 | u00 = _mm_unpacklo_epi8(__m128i(), u000); 455 | v00 = _mm_unpacklo_epi8(__m128i(), v000); 456 | } 457 | 458 | __m128i dR = _mm_mulhi_epi16(v00, _mm_set1_epi16(VR)); 459 | __m128i dG = _mm_add_epi16(_mm_mulhi_epi16(u00, _mm_set1_epi16(UG)), 
_mm_mulhi_epi16(v00, _mm_set1_epi16(VG))); 460 | __m128i dB = _mm_mulhi_epi16(u00, _mm_set1_epi16(UB)); 461 | 462 | __m128i xR[2] = { _mm_unpacklo_epi16(dR, dR), _mm_unpackhi_epi16(dR, dR) }; 463 | __m128i xG[2] = { _mm_unpacklo_epi16(dG, dG), _mm_unpackhi_epi16(dG, dG) }; 464 | __m128i xB[2] = { _mm_unpacklo_epi16(dB, dB), _mm_unpackhi_epi16(dB, dB) }; 465 | 466 | __m128i t[4]; 467 | __m128i b[4]; 468 | 469 | t[iR] = _mm_packus_epi16(_mm_add_epi16(y00, xR[0]), _mm_add_epi16(y01, xR[1])); 470 | t[iG] = _mm_packus_epi16(_mm_add_epi16(y00, xG[0]), _mm_add_epi16(y01, xG[1])); 471 | t[iB] = _mm_packus_epi16(_mm_add_epi16(y00, xB[0]), _mm_add_epi16(y01, xB[1])); 472 | t[iA] = _mm_set1_epi8(-1); 473 | b[iR] = _mm_packus_epi16(_mm_add_epi16(y10, xR[0]), _mm_add_epi16(y11, xR[1])); 474 | b[iG] = _mm_packus_epi16(_mm_add_epi16(y10, xG[0]), _mm_add_epi16(y11, xG[1])); 475 | b[iB] = _mm_packus_epi16(_mm_add_epi16(y10, xB[0]), _mm_add_epi16(y11, xB[1])); 476 | b[iA] = _mm_set1_epi8(-1); 477 | 478 | _MM_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 479 | _MM_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 480 | 481 | _mm_storeu_si128((__m128i*)rgb0 + 0, t[0]); 482 | _mm_storeu_si128((__m128i*)rgb0 + 1, t[1]); 483 | _mm_storeu_si128((__m128i*)rgb0 + 2, t[2]); 484 | _mm_storeu_si128((__m128i*)rgb0 + 3, t[3]); rgb0 += 16 * 4; 485 | _mm_storeu_si128((__m128i*)rgb1 + 0, b[0]); 486 | _mm_storeu_si128((__m128i*)rgb1 + 1, b[1]); 487 | _mm_storeu_si128((__m128i*)rgb1 + 2, b[2]); 488 | _mm_storeu_si128((__m128i*)rgb1 + 3, b[3]); rgb1 += 16 * 4; 489 | } 490 | if (componentRGB == 4) 491 | continue; 492 | #endif 493 | for (int w = 0; w < halfWidth; ++w) 494 | { 495 | int y00 = (*y0++); 496 | int y01 = (*y0++); 497 | int y10 = (*y1++); 498 | int y11 = (*y1++); 499 | if (videoRange) 500 | { 501 | y00 = ((y00 - 16) * Y) >> 8; 502 | y01 = ((y01 - 16) * Y) >> 8; 503 | y10 = ((y10 - 16) * Y) >> 8; 504 | y11 = ((y11 - 16) * Y) >> 8; 505 | } 506 | 507 | int u00 = (*u0++) - 128; 508 | int v00 = (*v0++) - 128; 
509 | if (interleaved) 510 | { 511 | u0++; 512 | v0++; 513 | } 514 | 515 | int dR = ( v00 * VR) >> 8; 516 | int dG = (u00 * UG + v00 * VG) >> 8; 517 | int dB = (u00 * UB ) >> 8; 518 | 519 | auto clamp = [](int value) -> unsigned char 520 | { 521 | return (unsigned char)(value < 255 ? value < 0 ? 0 : value : 255); 522 | }; 523 | 524 | if (componentRGB >= 1) rgb0[iR] = clamp(y00 + dR); 525 | if (componentRGB >= 2) rgb0[iG] = clamp(y00 + dG); 526 | if (componentRGB >= 3) rgb0[iB] = clamp(y00 + dB); 527 | if (componentRGB >= 4) rgb0[iA] = 255; 528 | rgb0 += componentRGB; 529 | 530 | if (componentRGB >= 1) rgb0[iR] = clamp(y01 + dR); 531 | if (componentRGB >= 2) rgb0[iG] = clamp(y01 + dG); 532 | if (componentRGB >= 3) rgb0[iB] = clamp(y01 + dB); 533 | if (componentRGB >= 4) rgb0[iA] = 255; 534 | rgb0 += componentRGB; 535 | 536 | if (componentRGB >= 1) rgb1[iR] = clamp(y10 + dR); 537 | if (componentRGB >= 2) rgb1[iG] = clamp(y10 + dG); 538 | if (componentRGB >= 3) rgb1[iB] = clamp(y10 + dB); 539 | if (componentRGB >= 4) rgb1[iA] = 255; 540 | rgb1 += componentRGB; 541 | 542 | if (componentRGB >= 1) rgb1[iR] = clamp(y11 + dR); 543 | if (componentRGB >= 2) rgb1[iG] = clamp(y11 + dG); 544 | if (componentRGB >= 3) rgb1[iB] = clamp(y11 + dB); 545 | if (componentRGB >= 4) rgb1[iA] = 255; 546 | rgb1 += componentRGB; 547 | } 548 | } 549 | } 550 | //------------------------------------------------------------------------------ 551 | #ifndef yuv2rgb_select 552 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 553 | yuv2rgb 554 | #endif 555 | //------------------------------------------------------------------------------ 556 | #ifndef yuv2rgb 557 | //------------------------------------------------------------------------------ 558 | #if defined(__llvm__) 559 | #define rgb2yuv_attribute(value) __attribute__((target(value))) 560 | #else 561 | #define rgb2yuv_attribute(value) 562 | #endif 563 | 
//------------------------------------------------------------------------------ 564 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 565 | #define HAVE_NEON 1 566 | #define yuv2rgb rgb2yuv_attribute("neon") yuv2rgb_neon 567 | #include "yuv2rgb.inl" 568 | #undef yuv2rgb 569 | #undef HAVE_NEON 570 | #undef yuv2rgb_select 571 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 572 | neon() ? yuv2rgb_neon : \ 573 | yuv2rgb 574 | #endif 575 | //------------------------------------------------------------------------------ 576 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 577 | #define HAVE_SSE2 1 578 | #define yuv2rgb rgb2yuv_attribute("sse2") yuv2rgb_sse2 579 | #include "yuv2rgb.inl" 580 | #undef yuv2rgb 581 | #undef HAVE_SSE2 582 | #undef yuv2rgb_select 583 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 584 | sse2() ? yuv2rgb_sse2 : \ 585 | yuv2rgb 586 | #endif 587 | //------------------------------------------------------------------------------ 588 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 589 | #define HAVE_AVX2 1 590 | #define yuv2rgb rgb2yuv_attribute("avx2") yuv2rgb_avx2 591 | #include "yuv2rgb.inl" 592 | #undef yuv2rgb 593 | #undef HAVE_AVX2 594 | #undef yuv2rgb_select 595 | #define yuv2rgb_select(componentRGB, swizzleRGB, interleaved, firstU, videoRange) \ 596 | avx2() ? yuv2rgb_avx2 : \ 597 | sse2() ? 
yuv2rgb_sse2 : \ 598 | yuv2rgb 599 | #endif 600 | //------------------------------------------------------------------------------ 601 | #endif 602 | //------------------------------------------------------------------------------ 603 | -------------------------------------------------------------------------------- /yuv2rgb_amx.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb_amx Source 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "apple_amx.h" 11 | #include "yuv2rgb.h" 12 | 13 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 14 | 15 | // BT.709 - Video Range 16 | // Y U V 17 | // R = 1.164384 0.000000 1.792741 18 | // G = 1.164384 -0.213249 -0.532909 19 | // B = 1.164384 2.112402 0.000000 20 | // 21 | // BT.709 - Full Range 22 | // Y U V 23 | // R = 1.000000 0.000000 1.581000 24 | // G = 1.000000 -0.188062 -0.469967 25 | // B = 1.000000 1.862906 0.000000 26 | #define vY 1.164384 27 | #define vUG -0.213249 28 | #define vUB 2.112402 29 | #define vVR 1.792741 30 | #define vVG -0.532909 31 | #define fY 1.000000 32 | #define fUG -0.188062 33 | #define fUB 1.862906 34 | #define fVR 1.581000 35 | #define fVG -0.469967 36 | 37 | //------------------------------------------------------------------------------ 38 | template 39 | void yuv2rgb_amx(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB) 40 | { 41 | #if defined(__APPLE__) && defined(__aarch64__) 42 | if (strideRGB < 0) 43 | { 44 | rgb = (char*)rgb - (strideRGB * (height - 1)); 45 | } 46 | 47 | int halfWidth = width >> 1; 48 | int halfHeight = height >> 1; 49 | 
50 | int iR = rgbSwizzle ? 2 : 0; 51 | int iG = 1; 52 | int iB = rgbSwizzle ? 0 : 2; 53 | int iA = 3; 54 | 55 | static constexpr int16_t Y = fullRange ? (int)(fY * 256) : (int)(vY * 256); 56 | static constexpr int16_t UG = fullRange ? (int)(fUG * 255) : (int)(vUG * 255); 57 | static constexpr int16_t UB = fullRange ? (int)(fUB * 255) : (int)(vUB * 255); 58 | static constexpr int16_t VR = fullRange ? (int)(fVR * 255) : (int)(vVR * 255); 59 | static constexpr int16_t VG = fullRange ? (int)(fVG * 255) : (int)(vVG * 255); 60 | 61 | static constexpr int16_t vector256[32] = { [0 ... 31] = 256 }; 62 | static constexpr int16_t vectorN128[32] = { [0 ... 31] = -128 }; 63 | static constexpr int16_t vectorY[32] = { [0 ... 31] = (int16_t)(Y >> 1) }; 64 | static constexpr int16_t vectorVR[32] = { [0 ... 31] = (int16_t)(VR >> 2) }; 65 | static constexpr int16_t vectorUG[32] = { [0 ... 31] = (int16_t)(UG >> 1) }; 66 | static constexpr int16_t vectorVG[32] = { [0 ... 31] = (int16_t)(VG >> 1) }; 67 | static constexpr int16_t vectorUB[32] = { [0 ... 
31] = (int16_t)(UB >> 3) }; 68 | 69 | amx_set(); 70 | amx_ldy( /*.memory_offset = */(uint64_t)vector256, .register_index = 1 ); 71 | amx_ldy( /*.memory_offset = */(uint64_t)vectorN128, .register_index = 2 ); 72 | amx_ldy( /*.memory_offset = */(uint64_t)vectorY, .register_index = 3 ); 73 | amx_ldy( /*.memory_offset = */(uint64_t)vectorVR, .register_index = 4 ); 74 | amx_ldy( /*.memory_offset = */(uint64_t)vectorUG, .register_index = 5 ); 75 | amx_ldy( /*.memory_offset = */(uint64_t)vectorVG, .register_index = 6 ); 76 | amx_ldy( /*.memory_offset = */(uint64_t)vectorUB, .register_index = 7 ); 77 | for (int h = 0; h < halfHeight; ++h) 78 | { 79 | const unsigned char* y0 = (unsigned char*)y; 80 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 81 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 82 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 83 | unsigned char* rgb0 = (unsigned char*)rgb; 84 | unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB; 85 | int halfWidth128 = width / 128; 86 | for (int w = 0; w < halfWidth128; ++w) 87 | { 88 | // Clear 89 | amx_mac16( .skip_x = 1, .skip_y = 1, .skip_z = 1, .mode_32 = 1 ); 90 | 91 | // Load 92 | amx_ldx( /*.memory_offset = */(uint64_t)y0 + 0, .register_index = 0 ); 93 | amx_ldx( /*.memory_offset = */(uint64_t)y0 + 64, .register_index = 1 ); y0 += 128; 94 | amx_ldx( /*.memory_offset = */(uint64_t)y1 + 0, .register_index = 2 ); 95 | amx_ldx( /*.memory_offset = */(uint64_t)y1 + 64, .register_index = 3 ); y1 += 128; 96 | amx_ldx( /*.memory_offset = */(uint64_t)u0 + 0, .register_index = 4 ); u0 += 64; 97 | amx_ldx( /*.memory_offset = */(uint64_t)v0 + 0, .register_index = 5 ); v0 += 64; 98 | 99 | // Y 100 | amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 32, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 101 | amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 40, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 102 | 
amx_vecint( .offset_x = 0x000, .offset_y = 0x0C0, .offset_z = 48, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 103 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 34, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 104 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 42, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 105 | amx_vecint( .offset_x = 0x040, .offset_y = 0x0C0, .offset_z = 50, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 106 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 36, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 107 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 44, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 108 | amx_vecint( .offset_x = 0x080, .offset_y = 0x0C0, .offset_z = 52, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 109 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 38, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 110 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 46, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 111 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x0C0, .offset_z = 54, .count_x = 1, .mask = 64, .extended = 11, .shift_right = 7 ); 112 | 113 | // UV 114 | amx_vecint( .offset_x = 0x100, .offset_y = 0x080, .offset_z = 56, .count_x = 2, .extended = 12, .add = 1 ); 115 | amx_vecint( .offset_x = 0x140, .offset_y = 0x080, .offset_z = 60, .count_x = 2, .extended = 12, .add = 1 ); 116 | amx_extrx( .offset_x = 0x000, .offset_z = 56 ); 117 | amx_extrx( .offset_x = 0x040, .offset_z = 57 ); 118 | amx_extrx( .offset_x = 0x080, .offset_z = 58 ); 119 | amx_extrx( .offset_x = 0x0C0, .offset_z = 59 ); 120 | amx_extrx( .offset_x = 0x100, .offset_z = 60 ); 121 | amx_extrx( .offset_x = 0x140, .offset_z = 61 ); 122 | amx_extrx( .offset_x = 0x180, .offset_z = 62 ); 123 | amx_extrx( .offset_x = 0x1C0, 
.offset_z = 63 ); 124 | amx_vecint( .offset_x = 0x200 - 2, .offset_y = 0, .offset_z = 56, .add = 1 ); 125 | amx_vecint( .offset_x = 0x040 - 2, .offset_y = 0, .offset_z = 57, .add = 1 ); 126 | amx_vecint( .offset_x = 0x080 - 2, .offset_y = 0, .offset_z = 58, .add = 1 ); 127 | amx_vecint( .offset_x = 0x0C0 - 2, .offset_y = 0, .offset_z = 59, .add = 1 ); 128 | amx_vecint( .offset_x = 0x100 - 2, .offset_y = 0, .offset_z = 60, .add = 1 ); 129 | amx_vecint( .offset_x = 0x140 - 2, .offset_y = 0, .offset_z = 61, .add = 1 ); 130 | amx_vecint( .offset_x = 0x180 - 2, .offset_y = 0, .offset_z = 62, .add = 1 ); 131 | amx_vecint( .offset_x = 0x1C0 - 2, .offset_y = 0, .offset_z = 63, .add = 1 ); 132 | amx_extrx( .offset_x = 0x000, .offset_z = 56 ); 133 | amx_extrx( .offset_x = 0x040, .offset_z = 57 ); 134 | amx_extrx( .offset_x = 0x080, .offset_z = 58 ); 135 | amx_extrx( .offset_x = 0x0C0, .offset_z = 59 ); 136 | amx_extrx( .offset_x = 0x100, .offset_z = 60 ); 137 | amx_extrx( .offset_x = 0x140, .offset_z = 61 ); 138 | amx_extrx( .offset_x = 0x180, .offset_z = 62 ); 139 | amx_extrx( .offset_x = 0x1C0, .offset_z = 63 ); 140 | amx_vecint( .offset_x = 0x000, .offset_y = 0x140, .offset_z = 40, .shift_right = 7 ); 141 | amx_vecint( .offset_x = 0x000, .offset_y = 0x140, .offset_z = 44, .shift_right = 7 ); 142 | amx_vecint( .offset_x = 0x040, .offset_y = 0x140, .offset_z = 41, .shift_right = 7 ); 143 | amx_vecint( .offset_x = 0x040, .offset_y = 0x140, .offset_z = 45, .shift_right = 7 ); 144 | amx_vecint( .offset_x = 0x080, .offset_y = 0x140, .offset_z = 42, .shift_right = 7 ); 145 | amx_vecint( .offset_x = 0x080, .offset_y = 0x140, .offset_z = 46, .shift_right = 7 ); 146 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x140, .offset_z = 43, .shift_right = 7 ); 147 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x140, .offset_z = 47, .shift_right = 7 ); 148 | amx_vecint( .offset_x = 0x000, .offset_y = 0x1C0, .offset_z = 48, .shift_right = 5 ); 149 | amx_vecint( .offset_x = 0x000, .offset_y = 
0x1C0, .offset_z = 52, .shift_right = 5 ); 150 | amx_vecint( .offset_x = 0x040, .offset_y = 0x1C0, .offset_z = 49, .shift_right = 5 ); 151 | amx_vecint( .offset_x = 0x040, .offset_y = 0x1C0, .offset_z = 53, .shift_right = 5 ); 152 | amx_vecint( .offset_x = 0x080, .offset_y = 0x1C0, .offset_z = 50, .shift_right = 5 ); 153 | amx_vecint( .offset_x = 0x080, .offset_y = 0x1C0, .offset_z = 54, .shift_right = 5 ); 154 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x1C0, .offset_z = 51, .shift_right = 5 ); 155 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x1C0, .offset_z = 55, .shift_right = 5 ); 156 | amx_vecint( .offset_x = 0x100, .offset_y = 0x100, .offset_z = 32, .shift_right = 6 ); 157 | amx_vecint( .offset_x = 0x100, .offset_y = 0x100, .offset_z = 36, .shift_right = 6 ); 158 | amx_vecint( .offset_x = 0x140, .offset_y = 0x100, .offset_z = 33, .shift_right = 6 ); 159 | amx_vecint( .offset_x = 0x140, .offset_y = 0x100, .offset_z = 37, .shift_right = 6 ); 160 | amx_vecint( .offset_x = 0x180, .offset_y = 0x100, .offset_z = 34, .shift_right = 6 ); 161 | amx_vecint( .offset_x = 0x180, .offset_y = 0x100, .offset_z = 38, .shift_right = 6 ); 162 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x100, .offset_z = 35, .shift_right = 6 ); 163 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x100, .offset_z = 39, .shift_right = 6 ); 164 | amx_vecint( .offset_x = 0x100, .offset_y = 0x180, .offset_z = 40, .shift_right = 7 ); 165 | amx_vecint( .offset_x = 0x100, .offset_y = 0x180, .offset_z = 44, .shift_right = 7 ); 166 | amx_vecint( .offset_x = 0x140, .offset_y = 0x180, .offset_z = 41, .shift_right = 7 ); 167 | amx_vecint( .offset_x = 0x140, .offset_y = 0x180, .offset_z = 45, .shift_right = 7 ); 168 | amx_vecint( .offset_x = 0x180, .offset_y = 0x180, .offset_z = 42, .shift_right = 7 ); 169 | amx_vecint( .offset_x = 0x180, .offset_y = 0x180, .offset_z = 46, .shift_right = 7 ); 170 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x180, .offset_z = 43, .shift_right = 7 ); 171 | amx_vecint( 
.offset_x = 0x1C0, .offset_y = 0x180, .offset_z = 47, .shift_right = 7 ); 172 | 173 | // RGBA 174 | if (iR == 0) 175 | { 176 | amx_extrx( .offset_x = 0x000, .offset_z = 32 ); 177 | amx_extrx( .offset_x = 0x040, .offset_z = 33 ); 178 | amx_extrx( .offset_x = 0x080, .offset_z = 34 ); 179 | amx_extrx( .offset_x = 0x0C0, .offset_z = 35 ); 180 | amx_extrx( .offset_x = 0x100, .offset_z = 36 ); 181 | amx_extrx( .offset_x = 0x140, .offset_z = 37 ); 182 | amx_extrx( .offset_x = 0x180, .offset_z = 38 ); 183 | amx_extrx( .offset_x = 0x1C0, .offset_z = 39 ); 184 | } 185 | else 186 | { 187 | amx_extrx( .offset_x = 0x000, .offset_z = 48 ); 188 | amx_extrx( .offset_x = 0x040, .offset_z = 49 ); 189 | amx_extrx( .offset_x = 0x080, .offset_z = 50 ); 190 | amx_extrx( .offset_x = 0x0C0, .offset_z = 51 ); 191 | amx_extrx( .offset_x = 0x100, .offset_z = 52 ); 192 | amx_extrx( .offset_x = 0x140, .offset_z = 53 ); 193 | amx_extrx( .offset_x = 0x180, .offset_z = 54 ); 194 | amx_extrx( .offset_x = 0x1C0, .offset_z = 55 ); 195 | } 196 | amx_vecint( .offset_x = 0x000, .offset_y = 0x000, .offset_z = 0, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 197 | amx_vecint( .offset_x = 0x040, .offset_y = 0x000, .offset_z = 4, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 198 | amx_vecint( .offset_x = 0x080, .offset_y = 0x000, .offset_z = 8, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 199 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x000, .offset_z = 12, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 200 | amx_vecint( .offset_x = 0x100, .offset_y = 0x000, .offset_z = 16, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 201 | amx_vecint( .offset_x = 0x140, .offset_y = 0x000, .offset_z = 20, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 202 | amx_vecint( .offset_x = 0x180, .offset_y = 0x000, .offset_z = 24, .count_x = 1, .count_y = 1, .extended = 10, .add = 1 ); 203 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x000, .offset_z = 28, .count_x = 1, 
.count_y = 1, .extended = 10, .add = 1 ); 204 | amx_extrx( .offset_x = 0x000, .offset_z = 40 ); 205 | amx_extrx( .offset_x = 0x040, .offset_z = 41 ); 206 | amx_extrx( .offset_x = 0x080, .offset_z = 42 ); 207 | amx_extrx( .offset_x = 0x0C0, .offset_z = 43 ); 208 | amx_extrx( .offset_x = 0x100, .offset_z = 44 ); 209 | amx_extrx( .offset_x = 0x140, .offset_z = 45 ); 210 | amx_extrx( .offset_x = 0x180, .offset_z = 46 ); 211 | amx_extrx( .offset_x = 0x1C0, .offset_z = 47 ); 212 | amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 0, .count_x = 1, .extended = 12 ); 213 | amx_vecint( .offset_x = 0x040, .offset_y = 0x040, .offset_z = 4, .count_x = 1, .extended = 12 ); 214 | amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 8, .count_x = 1, .extended = 12 ); 215 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x040, .offset_z = 12, .count_x = 1, .extended = 12 ); 216 | amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 16, .count_x = 1, .extended = 12 ); 217 | amx_vecint( .offset_x = 0x140, .offset_y = 0x040, .offset_z = 20, .count_x = 1, .extended = 12 ); 218 | amx_vecint( .offset_x = 0x180, .offset_y = 0x040, .offset_z = 24, .count_x = 1, .extended = 12 ); 219 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x040, .offset_z = 28, .count_x = 1, .extended = 12 ); 220 | if (iB == 2) 221 | { 222 | amx_extrx( .offset_x = 0x000, .offset_z = 48 ); 223 | amx_extrx( .offset_x = 0x040, .offset_z = 49 ); 224 | amx_extrx( .offset_x = 0x080, .offset_z = 50 ); 225 | amx_extrx( .offset_x = 0x0C0, .offset_z = 51 ); 226 | amx_extrx( .offset_x = 0x100, .offset_z = 52 ); 227 | amx_extrx( .offset_x = 0x140, .offset_z = 53 ); 228 | amx_extrx( .offset_x = 0x180, .offset_z = 54 ); 229 | amx_extrx( .offset_x = 0x1C0, .offset_z = 55 ); 230 | } 231 | else 232 | { 233 | amx_extrx( .offset_x = 0x000, .offset_z = 32 ); 234 | amx_extrx( .offset_x = 0x040, .offset_z = 33 ); 235 | amx_extrx( .offset_x = 0x080, .offset_z = 34 ); 236 | amx_extrx( .offset_x = 0x0C0, .offset_z = 
35 ); 237 | amx_extrx( .offset_x = 0x100, .offset_z = 36 ); 238 | amx_extrx( .offset_x = 0x140, .offset_z = 37 ); 239 | amx_extrx( .offset_x = 0x180, .offset_z = 38 ); 240 | amx_extrx( .offset_x = 0x1C0, .offset_z = 39 ); 241 | } 242 | amx_vecint( .offset_x = 0x1E0, .offset_y = 0x040, .offset_z = 0, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 243 | amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 1, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 244 | amx_vecint( .offset_x = 0x020, .offset_y = 0x040, .offset_z = 4, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 245 | amx_vecint( .offset_x = 0x040, .offset_y = 0x040, .offset_z = 5, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 246 | amx_vecint( .offset_x = 0x060, .offset_y = 0x040, .offset_z = 8, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 247 | amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 9, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 248 | amx_vecint( .offset_x = 0x0A0, .offset_y = 0x040, .offset_z = 12, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 249 | amx_vecint( .offset_x = 0x0C0, .offset_y = 0x040, .offset_z = 13, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 250 | amx_vecint( .offset_x = 0x0E0, .offset_y = 0x040, .offset_z = 16, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 251 | amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 17, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 252 | amx_vecint( .offset_x = 0x120, .offset_y = 0x040, .offset_z = 20, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 253 | amx_vecint( .offset_x = 0x140, .offset_y = 0x040, .offset_z = 21, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 254 | amx_vecint( .offset_x = 0x160, .offset_y = 0x040, .offset_z = 24, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 255 | amx_vecint( .offset_x = 0x180, .offset_y = 0x040, 
.offset_z = 25, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 256 | amx_vecint( .offset_x = 0x1A0, .offset_y = 0x040, .offset_z = 28, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 257 | amx_vecint( .offset_x = 0x1C0, .offset_y = 0x040, .offset_z = 29, .count_x = 1, .mask = 1, .extended = 6, .shift_right = 8 ); 258 | 259 | // Store 260 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 0, .register_index = 0 ); 261 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 64, .register_index = 1 ); 262 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 128, .register_index = 4 ); 263 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 192, .register_index = 5 ); 264 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 256, .register_index = 8 ); 265 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 320, .register_index = 9 ); 266 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 384, .register_index = 12 ); 267 | amx_stz( /*.memory_offset = */(uint64_t)rgb0 + 448, .register_index = 13 ); rgb0 += 128 * 4; 268 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 0, .register_index = 16 ); 269 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 64, .register_index = 17 ); 270 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 128, .register_index = 20 ); 271 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 192, .register_index = 21 ); 272 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 256, .register_index = 24 ); 273 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 320, .register_index = 25 ); 274 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 384, .register_index = 28 ); 275 | amx_stz( /*.memory_offset = */(uint64_t)rgb1 + 448, .register_index = 29 ); rgb1 += 128 * 4; 276 | } 277 | } 278 | amx_clr(); 279 | #endif 280 | } 281 | //------------------------------------------------------------------------------ 282 | void yyy2rgb_amx(int width, int height, const void* y, int strideY, void* rgb, int strideRGB) 283 | { 284 | #if defined(__APPLE__) && defined(__aarch64__) 
    // -- body of yyy2rgb_amx (signature above): replicate the luma plane into
    // packed 4-byte grayscale pixels using AMX, two rows of 128 pixels per
    // inner iteration.
    int halfWidth = width >> 1;
    int halfHeight = height >> 1;

    // 257 == 0x0101: NOTE(review): appears to spread each luma byte into both
    // bytes of a 16-bit lane (y * 257) -- confirm against apple_amx.h.
    static constexpr int16_t vector257[32] = { [0 ... 31] = 257 };

    amx_set();  // enable AMX state for this thread
    amx_ldy( .memory_offset = (uint64_t)vector257, .register_index = 1 );
    for (int h = 0; h < halfHeight; ++h)
    {
        // Two source rows and two destination rows per iteration.
        const unsigned char* y0 = (unsigned char*)y;
        const unsigned char* y1 = y0 + strideY; y = y1 + strideY;
        unsigned char* rgb0 = (unsigned char*)rgb;
        unsigned char* rgb1 = rgb0 + strideRGB; rgb = rgb1 + strideRGB;
        int halfWidth128 = width / 128;  // 128 pixels per AMX pass
        for (int w = 0; w < halfWidth128; ++w)
        {
            // Clear the Z accumulators.
            amx_mac16( .skip_x = 1, .skip_y = 1, .skip_z = 1, .mode_32 = 1 );

            // Load 128 luma bytes from each of the two rows.
            amx_ldx( .memory_offset = (uint64_t)y0 + 0, .register_index = 0 );
            amx_ldx( .memory_offset = (uint64_t)y0 + 64, .register_index = 2 ); y0 += 128;
            amx_ldx( .memory_offset = (uint64_t)y1 + 0, .register_index = 4 );
            amx_ldx( .memory_offset = (uint64_t)y1 + 64, .register_index = 6 ); y1 += 128;

            // Y : widen and replicate the luma into the Z registers.
            // NOTE(review): the exact .extended/.neg/.add field semantics
            // follow apple_amx.h; the sequence is order-dependent and
            // intentionally left untouched.
            amx_vecint( .offset_x = 0x000, .offset_y = 0x040, .offset_z = 0, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x080, .offset_y = 0x040, .offset_z = 8, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x100, .offset_y = 0x040, .offset_z = 16, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x180, .offset_y = 0x040, .offset_z = 24, .count_x = 2, .extended = 12 );
            amx_vecint( .offset_x = 0x000, .offset_y = 0x000, .offset_z = 2, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x080, .offset_y = 0x000, .offset_z = 10, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x100, .offset_y = 0x000, .offset_z = 18, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x180, .offset_y = 0x000, .offset_z = 26, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x1E0, .offset_y = 0x000, .offset_z = 0, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x060, .offset_y = 0x000, .offset_z = 8, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x0E0, .offset_y = 0x000, .offset_z = 16, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x160, .offset_y = 0x000, .offset_z = 24, .count_x = 2, .extended = 11, .add = 1 );
            amx_vecint( .offset_x = 0x1E0, .offset_y = 0x000, .offset_z = 2, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x060, .offset_y = 0x000, .offset_z = 10, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x0E0, .offset_y = 0x000, .offset_z = 18, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );
            amx_vecint( .offset_x = 0x160, .offset_y = 0x000, .offset_z = 26, .count_x = 2, .extended = 12, .neg = 1, .add = 1 );

            // Store : 8 x 64 bytes = 512 bytes (128 four-byte pixels) per row.
            amx_stz( .memory_offset = (uint64_t)rgb0 + 0, .register_index = 0 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 64, .register_index = 1 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 128, .register_index = 2 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 192, .register_index = 3 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 256, .register_index = 8 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 320, .register_index = 9 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 384, .register_index = 10 );
            amx_stz( .memory_offset = (uint64_t)rgb0 + 448, .register_index = 11 ); rgb0 += 128 * 4;
            amx_stz( .memory_offset = (uint64_t)rgb1 + 0, .register_index = 16 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 64, .register_index = 17 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 128, .register_index = 18 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 192, .register_index = 19 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 256, .register_index = 24 );
            amx_stz( .memory_offset = (uint64_t)rgb1 + 320, .register_index = 25 );
| amx_stz( .memory_offset = (uint64_t)rgb1 + 384, .register_index = 26 ); 344 | amx_stz( .memory_offset = (uint64_t)rgb1 + 448, .register_index = 27 ); rgb1 += 128 * 4; 345 | } 346 | } 347 | amx_clr(); 348 | #endif 349 | } 350 | //------------------------------------------------------------------------------ 351 | void yuv2rgb_yu12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 352 | { 353 | int strideY = align(width, alignWidth); 354 | int strideU = align(width, alignWidth) / 2; 355 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 356 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 357 | 358 | if (strideRGB == 0) 359 | strideRGB = rgbWidth * width; 360 | 361 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 362 | 363 | if (rgbWidth == 3) 364 | { 365 | if (rgbSwizzle) 366 | { 367 | if (fullRange) 368 | converter = yuv2rgb_amx<3, true, false, false, true>; 369 | else 370 | converter = yuv2rgb_amx<3, true, false, false, false>; 371 | } 372 | else 373 | { 374 | if (fullRange) 375 | converter = yuv2rgb_amx<3, false, false, false, true>; 376 | else 377 | converter = yuv2rgb_amx<3, false, false, false, false>; 378 | } 379 | } 380 | else if (rgbWidth == 4) 381 | { 382 | if (rgbSwizzle) 383 | { 384 | if (fullRange) 385 | converter = yuv2rgb_amx<4, true, false, false, true>; 386 | else 387 | converter = yuv2rgb_amx<4, true, false, false, false>; 388 | } 389 | else 390 | { 391 | if (fullRange) 392 | converter = yuv2rgb_amx<4, false, false, false, true>; 393 | else 394 | converter = yuv2rgb_amx<4, false, false, false, false>; 395 | } 396 | } 397 | 398 | converter(width, height, yuv, (char*)yuv + sizeY, (char*)yuv + sizeY + sizeU, strideY, strideU, strideU, rgb, strideRGB); 399 | } 400 | //------------------------------------------------------------------------------ 401 | void yuv2rgb_yv12_amx(int 
width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 402 | { 403 | int strideY = align(width, alignWidth); 404 | int strideU = align(width, alignWidth) / 2; 405 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 406 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 407 | 408 | if (strideRGB == 0) 409 | strideRGB = rgbWidth * width; 410 | 411 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 412 | 413 | if (rgbWidth == 3) 414 | { 415 | if (rgbSwizzle) 416 | { 417 | if (fullRange) 418 | converter = yuv2rgb_amx<3, true, false, false, true>; 419 | else 420 | converter = yuv2rgb_amx<3, true, false, false, false>; 421 | } 422 | else 423 | { 424 | if (fullRange) 425 | converter = yuv2rgb_amx<3, false, false, false, true>; 426 | else 427 | converter = yuv2rgb_amx<3, false, false, false, false>; 428 | } 429 | } 430 | else if (rgbWidth == 4) 431 | { 432 | if (rgbSwizzle) 433 | { 434 | if (fullRange) 435 | converter = yuv2rgb_amx<4, true, false, false, true>; 436 | else 437 | converter = yuv2rgb_amx<4, true, false, false, false>; 438 | } 439 | else 440 | { 441 | if (fullRange) 442 | converter = yuv2rgb_amx<4, false, false, false, true>; 443 | else 444 | converter = yuv2rgb_amx<4, false, false, false, false>; 445 | } 446 | } 447 | 448 | converter(width, height, yuv, (char*)yuv + sizeY + sizeU, (char*)yuv + sizeY, strideY, strideU, strideU, rgb, strideRGB); 449 | } 450 | //------------------------------------------------------------------------------ 451 | void yuv2rgb_nv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 452 | { 453 | int strideYUV = align(width, alignWidth); 454 | int sizeY = align(strideYUV * align(height, alignHeight), alignSize); 455 | int sizeUV = align(strideYUV * align(height, 
alignHeight) / 2, alignSize); 456 | 457 | if (strideRGB == 0) 458 | strideRGB = rgbWidth * width; 459 | 460 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 461 | 462 | if (rgbWidth == 3) 463 | { 464 | if (rgbSwizzle) 465 | { 466 | if (fullRange) 467 | converter = yuv2rgb_amx<3, true, true, true, true>; 468 | else 469 | converter = yuv2rgb_amx<3, true, true, true, false>; 470 | } 471 | else 472 | { 473 | if (fullRange) 474 | converter = yuv2rgb_amx<3, false, true, true, true>; 475 | else 476 | converter = yuv2rgb_amx<3, false, true, true, false>; 477 | } 478 | } 479 | else if (rgbWidth == 4) 480 | { 481 | if (rgbSwizzle) 482 | { 483 | if (fullRange) 484 | converter = yuv2rgb_amx<4, true, true, true, true>; 485 | else 486 | converter = yuv2rgb_amx<4, true, true, true, false>; 487 | } 488 | else 489 | { 490 | if (fullRange) 491 | converter = yuv2rgb_amx<4, false, true, true, true>; 492 | else 493 | converter = yuv2rgb_amx<4, false, true, true, false>; 494 | } 495 | } 496 | 497 | converter(width, height, yuv, (char*)yuv + sizeY, (char*)yuv + sizeY + 1, strideYUV, strideYUV, strideYUV, rgb, strideRGB); 498 | } 499 | //------------------------------------------------------------------------------ 500 | void yuv2rgb_nv21_amx(int width, int height, const void* yuv, void* rgb, bool fullRange, int rgbWidth, bool rgbSwizzle, int strideRGB, int alignWidth, int alignHeight, int alignSize) 501 | { 502 | int strideYUV = align(width, alignWidth); 503 | int sizeY = align(strideYUV * align(height, alignHeight), alignSize); 504 | int sizeUV = align(strideYUV * align(height, alignHeight) / 2, alignSize); 505 | 506 | if (strideRGB == 0) 507 | strideRGB = rgbWidth * width; 508 | 509 | auto converter = yuv2rgb_amx<3, false, false, false, false>; 510 | 511 | if (rgbWidth == 3) 512 | { 513 | if (rgbSwizzle) 514 | { 515 | if (fullRange) 516 | converter = yuv2rgb_amx<3, true, true, false, true>; 517 | else 518 | converter = yuv2rgb_amx<3, true, true, false, false>; 519 | } 520 
| else 521 | { 522 | if (fullRange) 523 | converter = yuv2rgb_amx<3, false, true, false, true>; 524 | else 525 | converter = yuv2rgb_amx<3, false, true, false, false>; 526 | } 527 | } 528 | else if (rgbWidth == 4) 529 | { 530 | if (rgbSwizzle) 531 | { 532 | if (fullRange) 533 | converter = yuv2rgb_amx<4, true, true, false, true>; 534 | else 535 | converter = yuv2rgb_amx<4, true, true, false, false>; 536 | } 537 | else 538 | { 539 | if (fullRange) 540 | converter = yuv2rgb_amx<4, false, true, false, true>; 541 | else 542 | converter = yuv2rgb_amx<4, false, true, false, false>; 543 | } 544 | } 545 | 546 | converter(width, height, yuv, (char*)yuv + sizeY + 1, (char*)yuv + sizeY, strideYUV, strideYUV, strideYUV, rgb, strideRGB); 547 | } 548 | //------------------------------------------------------------------------------ 549 | -------------------------------------------------------------------------------- /yuv2rgb_amx.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2rgb_amx Header 3 | // 4 | // Copyright (c) 2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | template 15 | xxYUV_EXPORT void yuv2rgb_amx(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* rgb, int strideRGB); 16 | //------------------------------------------------------------------------------ 17 | xxYUV_EXPORT void yyy2rgb_amx(int width, int height, const void* y, int strideY, void* rgb, int strideRGB); 18 | //------------------------------------------------------------------------------ 19 | xxYUV_EXPORT void yuv2rgb_yu12_amx(int width, int 
height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 20 | xxYUV_EXPORT void yuv2rgb_yv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 21 | xxYUV_EXPORT void yuv2rgb_nv12_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 22 | xxYUV_EXPORT void yuv2rgb_nv21_amx(int width, int height, const void* yuv, void* rgb, bool fullRange = true, int rgbWidth = 3, bool rgbSwizzle = false, int strideRGB = 0, int alignWidth = 16, int alignHeight = 1, int alignSize = 1); 23 | //------------------------------------------------------------------------------ 24 | -------------------------------------------------------------------------------- /yuv2yuva.cpp: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Source 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__llvm__) 8 | # pragma clang diagnostic ignored "-Wunused-variable" 9 | #endif 10 | #include "cpu.h" 11 | #include "yuv2yuva.inl" 12 | #include "yuv2yuva.h" 13 | 14 | #define align(v, a) ((v) + ((a) - 1) & ~((a) - 1)) 15 | 16 | //------------------------------------------------------------------------------ 17 | void yuv2yuva_yu12(const yuv2yuva_parameter* parameter) 18 | { 19 | int width = parameter->width; 20 | int height = parameter->height; 21 | 22 | const void* y = parameter->y; 23 | const void* u = parameter->u; 24 | 
const void* v = parameter->v; 25 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 26 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 27 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 28 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 29 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 30 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 31 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 32 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 33 | 34 | void* output = parameter->output; 35 | bool swizzleOutput = parameter->swizzleOutput; 36 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 37 | if (strideOutput < 0) 38 | { 39 | output = (char*)output - (strideOutput * (height - 1)); 40 | } 41 | 42 | u = u ? u : (char*)y + sizeY; 43 | v = v ? v : (char*)y + sizeY + sizeU; 44 | 45 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 46 | 47 | if (swizzleOutput) 48 | { 49 | static auto select = yuv2yuva_select(false, false, 2, 1, 0, 3); 50 | converter = select; 51 | } 52 | else 53 | { 54 | static auto select = yuv2yuva_select(false, false, 0, 1, 2, 3); 55 | converter = select; 56 | } 57 | 58 | converter(width, height, y, u, v, strideY, strideU, strideV, output, strideOutput); 59 | } 60 | //------------------------------------------------------------------------------ 61 | void yuv2yuva_yv12(const yuv2yuva_parameter* parameter) 62 | { 63 | int width = parameter->width; 64 | int height = parameter->height; 65 | 66 | const void* y = parameter->y; 67 | const void* u = parameter->u; 68 | const void* v = parameter->v; 69 | int alignWidth = parameter->alignWidth ? 
parameter->alignWidth : 16; 70 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 71 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 72 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 73 | int strideU = parameter->strideU ? parameter->strideU : align(width, alignWidth) / 2; 74 | int strideV = parameter->strideV ? parameter->strideV : align(width, alignWidth) / 2; 75 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 76 | int sizeU = align(strideU * align(height, alignHeight) / 2, alignSize); 77 | 78 | void* output = parameter->output; 79 | bool swizzleOutput = parameter->swizzleOutput; 80 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 81 | if (strideOutput < 0) 82 | { 83 | output = (char*)output - (strideOutput * (height - 1)); 84 | } 85 | 86 | u = u ? u : (char*)y + sizeY + sizeU; 87 | v = v ? v : (char*)y + sizeY; 88 | 89 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 90 | 91 | if (swizzleOutput) 92 | { 93 | static auto select = yuv2yuva_select(false, false, 2, 1, 0, 3); 94 | converter = select; 95 | } 96 | else 97 | { 98 | static auto select = yuv2yuva_select(false, false, 0, 1, 2, 3); 99 | converter = select; 100 | } 101 | 102 | converter(width, height, y, u, v, strideY, strideU, strideV, output, strideOutput); 103 | } 104 | //------------------------------------------------------------------------------ 105 | void yuv2yuva_nv12(const yuv2yuva_parameter* parameter) 106 | { 107 | int width = parameter->width; 108 | int height = parameter->height; 109 | 110 | const void* y = parameter->y; 111 | const void* u = parameter->u; 112 | const void* v = parameter->v; 113 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 114 | int alignHeight = parameter->alignHeight ? 
parameter->alignHeight : 1; 115 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 116 | int strideY = parameter->strideY ? parameter->strideY : align(width, alignWidth); 117 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 118 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 119 | 120 | void* output = parameter->output; 121 | bool swizzleOutput = parameter->swizzleOutput; 122 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 123 | if (strideOutput < 0) 124 | { 125 | output = (char*)output - (strideOutput * (height - 1)); 126 | } 127 | 128 | u = u ? u : (char*)y + sizeY; 129 | v = v ? v : (char*)y + sizeY + 1; 130 | 131 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 132 | 133 | if (swizzleOutput) 134 | { 135 | static auto select = yuv2yuva_select(true, true, 2, 1, 0, 3); 136 | converter = select; 137 | } 138 | else 139 | { 140 | static auto select = yuv2yuva_select(true, true, 0, 1, 2, 3); 141 | converter = select; 142 | } 143 | 144 | converter(width, height, y, u, v, strideY, strideY, strideY, output, strideOutput); 145 | } 146 | //------------------------------------------------------------------------------ 147 | void yuv2yuva_nv21(const yuv2yuva_parameter* parameter) 148 | { 149 | int width = parameter->width; 150 | int height = parameter->height; 151 | 152 | const void* y = parameter->y; 153 | const void* u = parameter->u; 154 | const void* v = parameter->v; 155 | int alignWidth = parameter->alignWidth ? parameter->alignWidth : 16; 156 | int alignHeight = parameter->alignHeight ? parameter->alignHeight : 1; 157 | int alignSize = parameter->alignSize ? parameter->alignSize : 1; 158 | int strideY = parameter->strideY ? 
parameter->strideY : align(width, alignWidth); 159 | int sizeY = align(strideY * align(height, alignHeight), alignSize); 160 | int sizeUV = align(strideY * align(height, alignHeight) / 2, alignSize); 161 | 162 | void* output = parameter->output; 163 | bool swizzleOutput = parameter->swizzleOutput; 164 | int strideOutput = parameter->strideOutput ? parameter->strideOutput : 4 * width; 165 | if (strideOutput < 0) 166 | { 167 | output = (char*)output - (strideOutput * (height - 1)); 168 | } 169 | 170 | u = u ? u : (char*)y + sizeY + 1; 171 | v = v ? v : (char*)y + sizeY; 172 | 173 | void (*converter)(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput); 174 | 175 | if (swizzleOutput) 176 | { 177 | static auto select = yuv2yuva_select(true, false, 2, 1, 0, 3); 178 | converter = select; 179 | } 180 | else 181 | { 182 | static auto select = yuv2yuva_select(true, false, 0, 1, 2, 3); 183 | converter = select; 184 | } 185 | 186 | converter(width, height, y, u, v, strideY, strideY, strideY, output, strideOutput); 187 | } 188 | //------------------------------------------------------------------------------ 189 | -------------------------------------------------------------------------------- /yuv2yuva.h: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Header 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #pragma once 8 | 9 | #ifndef xxYUV_EXPORT 10 | #define xxYUV_EXPORT 11 | #endif 12 | 13 | //------------------------------------------------------------------------------ 14 | typedef struct _yuv2yuva_parameter 15 | { 16 | int width; 17 | int height; 18 | 19 | const void* y; 20 | const void* u; 21 | const void* v; 22 | int 
alignWidth; 23 | int alignHeight; 24 | int alignSize; 25 | int strideY; 26 | int strideU; 27 | int strideV; 28 | 29 | void* output; 30 | int strideOutput; 31 | bool swizzleOutput; 32 | } yuv2yuva_parameter; 33 | //------------------------------------------------------------------------------ 34 | xxYUV_EXPORT void yuv2yuva_yu12(const yuv2yuva_parameter* parameter); 35 | xxYUV_EXPORT void yuv2yuva_yv12(const yuv2yuva_parameter* parameter); 36 | xxYUV_EXPORT void yuv2yuva_nv12(const yuv2yuva_parameter* parameter); 37 | xxYUV_EXPORT void yuv2yuva_nv21(const yuv2yuva_parameter* parameter); 38 | //------------------------------------------------------------------------------ 39 | -------------------------------------------------------------------------------- /yuv2yuva.inl: -------------------------------------------------------------------------------- 1 | //============================================================================== 2 | // xxYUV : yuv2yuva Inline 3 | // 4 | // Copyright (c) 2020-2021 TAiGA 5 | // https://github.com/metarutaiga/xxYUV 6 | //============================================================================== 7 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 8 | # include 9 | #elif defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 10 | # include 11 | # if defined(__llvm__) 12 | # include 13 | # include 14 | # endif 15 | # define _MM_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 16 | __m128i T0, T1, T2, T3; \ 17 | T0 = _mm_unpacklo_epi8(R0, R1); \ 18 | T1 = _mm_unpacklo_epi8(R2, R3); \ 19 | T2 = _mm_unpackhi_epi8(R0, R1); \ 20 | T3 = _mm_unpackhi_epi8(R2, R3); \ 21 | R0 = _mm_unpacklo_epi16(T0, T1); \ 22 | R1 = _mm_unpackhi_epi16(T0, T1); \ 23 | R2 = _mm_unpacklo_epi16(T2, T3); \ 24 | R3 = _mm_unpackhi_epi16(T2, T3); \ 25 | } 26 | # define _MM256_TRANSPOSE4_EPI8(R0, R1, R2, R3) { \ 27 | __m256i T0, T1, T2, T3; \ 28 | T0 = _mm256_unpacklo_epi8(R0, 
R1); \ 29 | T1 = _mm256_unpacklo_epi8(R2, R3); \ 30 | T2 = _mm256_unpackhi_epi8(R0, R1); \ 31 | T3 = _mm256_unpackhi_epi8(R2, R3); \ 32 | R0 = _mm256_unpacklo_epi16(T0, T1); \ 33 | R1 = _mm256_unpackhi_epi16(T0, T1); \ 34 | R2 = _mm256_unpacklo_epi16(T2, T3); \ 35 | R3 = _mm256_unpackhi_epi16(T2, T3); \ 36 | } 37 | # define _MM256_TRANSPOSE4_SI128(R0, R1, R2, R3) {\ 38 | __m256i T0, T1, T2, T3; \ 39 | T0 = _mm256_permute2x128_si256(R0, R1, 32); \ 40 | T1 = _mm256_permute2x128_si256(R0, R1, 49); \ 41 | T2 = _mm256_permute2x128_si256(R2, R3, 32); \ 42 | T3 = _mm256_permute2x128_si256(R2, R3, 49); \ 43 | R0 = T0; \ 44 | R2 = T1; \ 45 | R1 = T2; \ 46 | R3 = T3; \ 47 | } 48 | #endif 49 | 50 | //------------------------------------------------------------------------------ 51 | template 52 | void yuv2yuva(int width, int height, const void* y, const void* u, const void* v, int strideY, int strideU, int strideV, void* output, int strideOutput) 53 | { 54 | int halfWidth = width >> 1; 55 | int halfHeight = height >> 1; 56 | 57 | for (int h = 0; h < halfHeight; ++h) 58 | { 59 | const unsigned char* y0 = (unsigned char*)y; 60 | const unsigned char* y1 = y0 + strideY; y = y1 + strideY; 61 | const unsigned char* u0 = (unsigned char*)u; u = u0 + strideU; 62 | const unsigned char* v0 = (unsigned char*)v; v = v0 + strideV; 63 | unsigned char* output0 = (unsigned char*)output; 64 | unsigned char* output1 = output0 + strideOutput; output = output1 + strideOutput; 65 | #if HAVE_NEON 66 | int halfWidth8 = halfWidth / 8; 67 | for (int w = 0; w < halfWidth8; ++w) 68 | { 69 | uint8x16_t y00 = vld1q_u8(y0); y0 += 16; 70 | uint8x16_t y10 = vld1q_u8(y1); y1 += 16; 71 | 72 | int8x8x2_t u000; 73 | int8x8x2_t v000; 74 | int8x16_t u00; 75 | int8x16_t v00; 76 | if (interleaved) 77 | { 78 | if (firstU) 79 | { 80 | int8x16_t uv00 = vld1q_u8(u0); u0 += 16; 81 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 82 | u000 = vzip_s8(uv00lh.val[0], uv00lh.val[0]); 83 | v000 = 
vzip_s8(uv00lh.val[1], uv00lh.val[1]); 84 | } 85 | else 86 | { 87 | int8x16_t uv00 = vld1q_u8(v0); v0 += 16; 88 | int8x8x2_t uv00lh = vuzp_s8(vget_low_s8(uv00), vget_high_s8(uv00)); 89 | u000 = vzip_s8(uv00lh.val[1], uv00lh.val[1]); 90 | v000 = vzip_s8(uv00lh.val[0], uv00lh.val[0]); 91 | } 92 | } 93 | else 94 | { 95 | int8x8_t u0000 = vld1_u8(u0); u0 += 8; 96 | int8x8_t v0000 = vld1_u8(v0); v0 += 8; 97 | u000 = vzip_s8(u0000, u0000); 98 | v000 = vzip_s8(v0000, v0000); 99 | } 100 | u00 = vcombine_s8(u000.val[0], u000.val[1]); 101 | v00 = vcombine_s8(v000.val[0], v000.val[1]); 102 | 103 | uint8x16x4_t t; 104 | uint8x16x4_t b; 105 | 106 | t.val[iY] = y00; 107 | t.val[iU] = u00; 108 | t.val[iV] = v00; 109 | t.val[iA] = vdupq_n_u8(255); 110 | b.val[iY] = y10; 111 | b.val[iU] = u00; 112 | b.val[iV] = v00; 113 | b.val[iA] = vdupq_n_u8(255); 114 | 115 | vst4q_u8(output0, t); output0 += 16 * 4; 116 | vst4q_u8(output1, b); output1 += 16 * 4; 117 | } 118 | continue; 119 | #elif HAVE_AVX2 120 | int halfWidth16 = halfWidth / 16; 121 | for (int w = 0; w < halfWidth16; ++w) 122 | { 123 | __m256i y00 = _mm256_loadu_si256((__m256i*)y0); y0 += 32; 124 | __m256i y10 = _mm256_loadu_si256((__m256i*)y1); y1 += 32; 125 | 126 | __m256i u00; 127 | __m256i v00; 128 | if (interleaved) 129 | { 130 | if (firstU) 131 | { 132 | __m256i uv00 = _mm256_loadu_si256((__m256i*)u0); u0 += 32; 133 | u00 = _mm256_and_si256(uv00, _mm256_set1_epi16(0xFF)); 134 | v00 = _mm256_srli_epi16(uv00, 8); 135 | } 136 | else 137 | { 138 | __m256i uv00 = _mm256_loadu_si256((__m256i*)v0); v0 += 32; 139 | u00 = _mm256_srli_epi16(uv00, 8); 140 | v00 = _mm256_and_si256(uv00, _mm256_set1_epi16(0xFF)); 141 | } 142 | u00 = _mm256_packus_epi16(u00, u00); 143 | v00 = _mm256_packus_epi16(v00, v00); 144 | } 145 | else 146 | { 147 | __m128i u000 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 148 | __m128i v000 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 149 | u00 = _mm256_castsi128_si256(u000); 150 | v00 = 
_mm256_castsi128_si256(v000); 151 | } 152 | u00 = _mm256_unpacklo_epi8(u00, u00); 153 | v00 = _mm256_unpacklo_epi8(v00, v00); 154 | 155 | __m256i t[4]; 156 | __m256i b[4]; 157 | 158 | t[iY] = y00; 159 | t[iU] = u00; 160 | t[iV] = v00; 161 | t[iA] = _mm256_set1_epi8(-1); 162 | b[iY] = y10; 163 | b[iU] = u00; 164 | b[iV] = v00; 165 | b[iA] = _mm256_set1_epi8(-1); 166 | 167 | _MM256_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 168 | _MM256_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 169 | _MM256_TRANSPOSE4_SI128(t[0], t[1], t[2], t[3]); 170 | _MM256_TRANSPOSE4_SI128(b[0], b[1], b[2], b[3]); 171 | 172 | _mm256_storeu_si256((__m256i*)output0 + 0, t[0]); 173 | _mm256_storeu_si256((__m256i*)output0 + 1, t[1]); 174 | _mm256_storeu_si256((__m256i*)output0 + 2, t[2]); 175 | _mm256_storeu_si256((__m256i*)output0 + 3, t[3]); 176 | _mm256_storeu_si256((__m256i*)output1 + 0, b[0]); output0 += 16 * 8; 177 | _mm256_storeu_si256((__m256i*)output1 + 1, b[1]); 178 | _mm256_storeu_si256((__m256i*)output1 + 2, b[2]); 179 | _mm256_storeu_si256((__m256i*)output1 + 3, b[3]); output1 += 16 * 8; 180 | } 181 | continue; 182 | #elif HAVE_SSE2 183 | int halfWidth8 = halfWidth / 8; 184 | for (int w = 0; w < halfWidth8; ++w) 185 | { 186 | __m128i y00 = _mm_loadu_si128((__m128i*)y0); y0 += 16; 187 | __m128i y10 = _mm_loadu_si128((__m128i*)y1); y1 += 16; 188 | 189 | __m128i u00; 190 | __m128i v00; 191 | if (interleaved) 192 | { 193 | if (firstU) 194 | { 195 | __m128i uv00 = _mm_loadu_si128((__m128i*)u0); u0 += 16; 196 | u00 = _mm_and_si128(uv00, _mm_set1_epi16(0xFF)); 197 | v00 = _mm_srli_epi16(uv00, 8); 198 | } 199 | else 200 | { 201 | __m128i uv00 = _mm_loadu_si128((__m128i*)v0); v0 += 16; 202 | u00 = _mm_srli_epi16(uv00, 8); 203 | v00 = _mm_and_si128(uv00, _mm_set1_epi16(0xFF)); 204 | } 205 | u00 = _mm_packus_epi16(u00, u00); 206 | v00 = _mm_packus_epi16(v00, v00); 207 | } 208 | else 209 | { 210 | u00 = _mm_loadl_epi64((__m128i*)u0); u0 += 8; 211 | v00 = _mm_loadl_epi64((__m128i*)v0); v0 += 8; 212 | } 
213 | u00 = _mm_unpacklo_epi8(u00, u00); 214 | v00 = _mm_unpacklo_epi8(v00, v00); 215 | 216 | __m128i t[4]; 217 | __m128i b[4]; 218 | 219 | t[iY] = y00; 220 | t[iU] = u00; 221 | t[iV] = v00; 222 | t[iA] = _mm_set1_epi8(-1); 223 | b[iY] = y10; 224 | b[iU] = u00; 225 | b[iV] = v00; 226 | b[iA] = _mm_set1_epi8(-1); 227 | 228 | _MM_TRANSPOSE4_EPI8(t[0], t[1], t[2], t[3]); 229 | _MM_TRANSPOSE4_EPI8(b[0], b[1], b[2], b[3]); 230 | 231 | _mm_storeu_si128((__m128i*)output0 + 0, t[0]); 232 | _mm_storeu_si128((__m128i*)output0 + 1, t[1]); 233 | _mm_storeu_si128((__m128i*)output0 + 2, t[2]); 234 | _mm_storeu_si128((__m128i*)output0 + 3, t[3]); output0 += 16 * 4; 235 | _mm_storeu_si128((__m128i*)output1 + 0, b[0]); 236 | _mm_storeu_si128((__m128i*)output1 + 1, b[1]); 237 | _mm_storeu_si128((__m128i*)output1 + 2, b[2]); 238 | _mm_storeu_si128((__m128i*)output1 + 3, b[3]); output1 += 16 * 4; 239 | } 240 | continue; 241 | #endif 242 | for (int w = 0; w < halfWidth; ++w) 243 | { 244 | auto y00 = (*y0++); 245 | auto y01 = (*y0++); 246 | auto y10 = (*y1++); 247 | auto y11 = (*y1++); 248 | 249 | auto u00 = (*u0++); 250 | auto v00 = (*v0++); 251 | if (interleaved) 252 | { 253 | u0++; 254 | v0++; 255 | } 256 | 257 | output0[iY] = y00; 258 | output0[iU] = u00; 259 | output0[iV] = v00; 260 | output0[iA] = 255; 261 | output0 += 4; 262 | 263 | output0[iY] = y01; 264 | output0[iU] = u00; 265 | output0[iV] = v00; 266 | output0[iA] = 255; 267 | output0 += 4; 268 | 269 | output1[iY] = y10; 270 | output1[iU] = u00; 271 | output1[iV] = v00; 272 | output1[iA] = 255; 273 | output1 += 4; 274 | 275 | output1[iY] = y11; 276 | output1[iU] = u00; 277 | output1[iV] = v00; 278 | output1[iA] = 255; 279 | output1 += 4; 280 | } 281 | } 282 | } 283 | //------------------------------------------------------------------------------ 284 | #ifndef yuv2yuva_select 285 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 286 | yuv2yuva 287 | #endif 288 | 
//------------------------------------------------------------------------------ 289 | #ifndef yuv2yuva 290 | //------------------------------------------------------------------------------ 291 | #if defined(__llvm__) 292 | #define yuv2yuva_attribute(value) __attribute__((target(value))) 293 | #else 294 | #define yuv2yuva_attribute(value) 295 | #endif 296 | //------------------------------------------------------------------------------ 297 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) 298 | #define HAVE_NEON 1 299 | #define yuv2yuva yuv2yuva_attribute("neon") yuv2yuva_neon 300 | #include "yuv2yuva.inl" 301 | #undef yuv2yuva 302 | #undef HAVE_NEON 303 | #undef yuv2yuva_select 304 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 305 | neon() ? yuv2yuva_neon : \ 306 | yuv2yuva 307 | #endif 308 | //------------------------------------------------------------------------------ 309 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 310 | #define HAVE_SSE2 1 311 | #define yuv2yuva yuv2yuva_attribute("sse2") yuv2yuva_sse2 312 | #include "yuv2yuva.inl" 313 | #undef yuv2yuva 314 | #undef HAVE_SSE2 315 | #undef yuv2yuva_select 316 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 317 | sse2() ? yuv2yuva_sse2 : \ 318 | yuv2yuva 319 | #endif 320 | //------------------------------------------------------------------------------ 321 | #if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386__) || defined(__amd64__) 322 | #define HAVE_AVX2 1 323 | #define yuv2yuva yuv2yuva_attribute("avx2") yuv2yuva_avx2 324 | #include "yuv2yuva.inl" 325 | #undef yuv2yuva 326 | #undef HAVE_AVX2 327 | #undef yuv2yuva_select 328 | #define yuv2yuva_select(interleaved, firstU, iY, iU, iV, iA) \ 329 | avx2() ? yuv2yuva_avx2 : \ 330 | sse2() ? 
yuv2yuva_sse2 : \ 331 | yuv2yuva 332 | #endif 333 | //------------------------------------------------------------------------------ 334 | #endif 335 | //------------------------------------------------------------------------------ 336 | --------------------------------------------------------------------------------