├── RawIntrinsics ├── Utils.ManuallyAdded.cs ├── RawIntrinsics.csproj ├── MMX.ManuallyAdded.cs ├── SSE.ManuallyAdded.cs ├── SSE42.cs ├── SSE2.ManuallyAdded.cs ├── AVX.ManuallyAdded.cs ├── MMX.cs ├── SSE3.cs ├── Other.cs ├── SSSE3.cs ├── Types.cs ├── FMA.cs ├── SSE41.cs └── SSE.cs ├── RawIntrinsicsGenerator ├── RawIntrinsicsGenerator.csproj ├── Program.cs └── Generator.cs ├── README.md ├── Wibic.sln └── .gitignore /RawIntrinsics/Utils.ManuallyAdded.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static class Utils 4 | { 5 | public static int _MM_SHUFFLE(int z, int y, int x, int w) => (z << 6) | (y << 4) | (x << 2) | w; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /RawIntrinsics/RawIntrinsics.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net5.0 5 | true 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /RawIntrinsics/MMX.ManuallyAdded.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static partial class MMX 4 | { 5 | /// 6 | /// Return vector of type __m64 with all elements set to zero. 7 | /// 8 | /// __m64 dst {FP32} 9 | public static __m64 _mm_setzero_si64() => System.Runtime.Intrinsics.Vector64.Zero; 10 | } 11 | } -------------------------------------------------------------------------------- /RawIntrinsics/SSE.ManuallyAdded.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static partial class SSE 4 | { 5 | /// 6 | /// Return vector of type __m128 with all elements set to zero. 
7 | /// 8 | /// __m128 dst {FP32} 9 | public static __m128 _mm_setzero_ps() => System.Runtime.Intrinsics.Vector128.Zero; 10 | } 11 | } -------------------------------------------------------------------------------- /RawIntrinsicsGenerator/RawIntrinsicsGenerator.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net5.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /RawIntrinsicsGenerator/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Reflection; 4 | using System.Threading.Tasks; 5 | 6 | namespace RawIntrinsicsGenerator 7 | { 8 | public static class Program 9 | { 10 | private static async Task Main(string[] _) 11 | { 12 | var savePath = Path.Combine(AppContext.BaseDirectory, "RawIntrinsics"); // BaseDirectory stays valid in single-file publishes, where Assembly.Location is empty and GetDirectoryName would return null 13 | 14 | await Generator.Generate("RawIntrinsics", savePath); 15 | 16 | Console.WriteLine($"Done! Generated files were saved to {savePath}"); 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /RawIntrinsics/SSE42.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class SSE42 4 | { 5 | /// 6 | /// Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".
7 | /// 8 | /// PCMPGTQ xmm, xmm 9 | /// __m128i {SI64} 10 | /// __m128i {SI64} 11 | /// __m128i dst {UI64} 12 | public static __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse42.CompareGreaterThan(a.SI64, b.SI64); 13 | 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /RawIntrinsics/SSE2.ManuallyAdded.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static partial class SSE2 4 | { 5 | /// 6 | /// Return vector of type __m128d with all elements set to zero. 7 | /// 8 | /// __m128d dst {M128} 9 | public static __m128d _mm_setzero_pd() => System.Runtime.Intrinsics.Vector128.Zero; 10 | 11 | /// 12 | /// Return vector of type __m128i with all elements set to zero. 13 | /// 14 | /// __m128i dst {M128} 15 | public static __m128i _mm_setzero_si128() => System.Runtime.Intrinsics.Vector128.Zero; 16 | } 17 | } -------------------------------------------------------------------------------- /RawIntrinsics/AVX.ManuallyAdded.cs: -------------------------------------------------------------------------------- 1 | using System.Runtime.Intrinsics; 2 | 3 | namespace RawIntrinsics 4 | { 5 | public static partial class AVX 6 | { 7 | /// 8 | /// Return vector of type __m256d with all elements set to zero. 9 | /// 10 | /// __m256d dst {FP64} 11 | public static __m256d _mm256_setzero_pd() => System.Runtime.Intrinsics.Vector256.Zero; 12 | 13 | /// 14 | /// Return vector of type __m256 with all elements set to zero. 15 | /// 16 | /// __m256 dst {FP32} 17 | public static __m256 _mm256_setzero_ps() => System.Runtime.Intrinsics.Vector256.Zero; 18 | 19 | /// 20 | /// Return vector of type __m256i with all elements set to zero. 
21 | /// 22 | /// __m256i dst {M256} 23 | public static __m256i _mm256_setzero_si256() => System.Runtime.Intrinsics.Vector256.Zero; 24 | } 25 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Wibic.RawIntrinsics 2 | 3 | .NET intrinsics represented by methods named after native intrinsic functions. 4 | Those methods can make it a lot easier to port existing SIMD-related C++ code into C# (I hope). 5 | 6 | Something like this: 7 | 8 | ```csharp 9 | var v = _mm256_set1_epi8(1); 10 | 11 | var end = data + size; 12 | var ptr = data; 13 | 14 | __m256i tmp; 15 | __m256i global_sum = _mm256_setzero_si256(); 16 | __m256i local_sum; 17 | 18 | while (ptr + 255 * 32 < end) 19 | { 20 | local_sum = _mm256_setzero_si256(); 21 | 22 | for (var i = 0; i < 255; i++, ptr += 32) 23 | { 24 | __m256i src = _mm256_loadu_si256((__m256i*)ptr); 25 | __m256i eq = _mm256_cmpeq_epi8(src, v); 26 | 27 | local_sum = _mm256_sub_epi8(local_sum, eq); 28 | } 29 | 30 | tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256()); 31 | global_sum = _mm256_add_epi64(global_sum, tmp); 32 | } 33 | ``` 34 | 35 | All methods are generated by parsing and using data from these two sources: 36 | 37 | https://github.com/dotnet/runtime/tree/master/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics 38 | 39 | and: 40 | 41 | https://software.intel.com/sites/landingpage/IntrinsicsGuide/files/data-latest.xml 42 | 43 | PS: The generator project is also included in the repo.
44 | -------------------------------------------------------------------------------- /Wibic.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsics", "RawIntrinsics\RawIntrinsics.csproj", "{2C8F57F8-6560-42F3-A24C-C649FA350F72}" 4 | EndProject 5 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RawIntrinsicsGenerator", "RawIntrinsicsGenerator\RawIntrinsicsGenerator.csproj", "{A701F8FF-AA1F-4F24-ADC7-7DFB7D0E7EDB}" 6 | EndProject 7 | Global 8 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 9 | Debug|Any CPU = Debug|Any CPU 10 | Release|Any CPU = Release|Any CPU 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {2C8F57F8-6560-42F3-A24C-C649FA350F72}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 14 | {2C8F57F8-6560-42F3-A24C-C649FA350F72}.Debug|Any CPU.Build.0 = Debug|Any CPU 15 | {2C8F57F8-6560-42F3-A24C-C649FA350F72}.Release|Any CPU.ActiveCfg = Release|Any CPU 16 | {2C8F57F8-6560-42F3-A24C-C649FA350F72}.Release|Any CPU.Build.0 = Release|Any CPU 17 | {A701F8FF-AA1F-4F24-ADC7-7DFB7D0E7EDB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 18 | {A701F8FF-AA1F-4F24-ADC7-7DFB7D0E7EDB}.Debug|Any CPU.Build.0 = Debug|Any CPU 19 | {A701F8FF-AA1F-4F24-ADC7-7DFB7D0E7EDB}.Release|Any CPU.ActiveCfg = Release|Any CPU 20 | {A701F8FF-AA1F-4F24-ADC7-7DFB7D0E7EDB}.Release|Any CPU.Build.0 = Release|Any CPU 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /RawIntrinsics/MMX.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class MMX 4 | { 5 | /// 6 | /// Broadcast 16-bit integer "a" to all elements of "dst".
7 | /// 8 | /// 9 | /// short {UI16} 10 | /// __m64 dst {FP32} 11 | public static __m64 _mm_set1_pi16(short a) => System.Runtime.Intrinsics.Vector64.Create((ushort)a); 12 | 13 | /// 14 | /// Broadcast 32-bit integer "a" to all elements of "dst". 15 | /// 16 | /// 17 | /// int {UI32} 18 | /// __m64 dst {FP32} 19 | public static __m64 _mm_set1_pi32(int a) => System.Runtime.Intrinsics.Vector64.Create((uint)a); 20 | 21 | /// 22 | /// Broadcast 8-bit integer "a" to all elements of "dst". 23 | /// 24 | /// 25 | /// byte {UI8} 26 | /// __m64 dst {FP32} 27 | public static __m64 _mm_set1_pi8(byte a) => System.Runtime.Intrinsics.Vector64.Create(a); 28 | 29 | /// 30 | /// Set packed 16-bit integers in "dst" with the supplied values in reverse order. 31 | /// 32 | /// 33 | /// short {UI16} 34 | /// short {UI16} 35 | /// short {UI16} 36 | /// short {UI16} 37 | /// __m64 dst {FP32} 38 | public static __m64 _mm_setr_pi16(short e3, short e2, short e1, short e0) => System.Runtime.Intrinsics.Vector64.Create((ushort)e3, (ushort)e2, (ushort)e1, (ushort)e0); 39 | 40 | /// 41 | /// Set packed 32-bit integers in "dst" with the supplied values in reverse order. 42 | /// 43 | /// 44 | /// int {UI32} 45 | /// int {UI32} 46 | /// __m64 dst {FP32} 47 | public static __m64 _mm_setr_pi32(int e1, int e0) => System.Runtime.Intrinsics.Vector64.Create((uint)e1, (uint)e0); 48 | 49 | /// 50 | /// Set packed 8-bit integers in "dst" with the supplied values in reverse order. 
51 | /// 52 | /// 53 | /// byte {UI8} 54 | /// byte {UI8} 55 | /// byte {UI8} 56 | /// byte {UI8} 57 | /// byte {UI8} 58 | /// byte {UI8} 59 | /// byte {UI8} 60 | /// byte {UI8} 61 | /// __m64 dst {FP32} 62 | public static __m64 _mm_setr_pi8(byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0) => System.Runtime.Intrinsics.Vector64.Create(e7, e6, e5, e4, e3, e2, e1, e0); 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /RawIntrinsics/SSE3.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class SSE3 4 | { 5 | /// 6 | /// Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". 7 | /// 8 | /// ADDSUBPD xmm, xmm 9 | /// __m128d {FP64} 10 | /// __m128d {FP64} 11 | /// __m128d dst {FP64} 12 | public static __m128d _mm_addsub_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.AddSubtract(a.FP64, b.FP64); 13 | 14 | /// 15 | /// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". 16 | /// 17 | /// ADDSUBPS xmm, xmm 18 | /// __m128 {FP32} 19 | /// __m128 {FP32} 20 | /// __m128 dst {FP32} 21 | public static __m128 _mm_addsub_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.AddSubtract(a.FP32, b.FP32); 22 | 23 | /// 24 | /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". 
25 | /// 26 | /// HADDPD xmm, xmm 27 | /// __m128d {FP64} 28 | /// __m128d {FP64} 29 | /// __m128d dst {FP64} 30 | public static __m128d _mm_hadd_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalAdd(a.FP64, b.FP64); 31 | 32 | /// 33 | /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". 34 | /// 35 | /// HADDPS xmm, xmm 36 | /// __m128 {FP32} 37 | /// __m128 {FP32} 38 | /// __m128 dst {FP32} 39 | public static __m128 _mm_hadd_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalAdd(a.FP32, b.FP32); 40 | 41 | /// 42 | /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". 43 | /// 44 | /// HSUBPD xmm, xmm 45 | /// __m128d {FP64} 46 | /// __m128d {FP64} 47 | /// __m128d dst {FP64} 48 | public static __m128d _mm_hsub_pd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalSubtract(a.FP64, b.FP64); 49 | 50 | /// 51 | /// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". 52 | /// 53 | /// HSUBPS xmm, xmm 54 | /// __m128 {FP32} 55 | /// __m128 {FP32} 56 | /// __m128 dst {FP32} 57 | public static __m128 _mm_hsub_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse3.HorizontalSubtract(a.FP32, b.FP32); 58 | 59 | /// 60 | /// Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. 61 | /// 62 | /// LDDQU xmm, m128 63 | /// __m128i {M128} 64 | /// __m128i dst {M128} 65 | public static __m128i _mm_lddqu_si128(__m128i* mem_addr) => System.Runtime.Intrinsics.X86.Sse3.LoadDquVector128((sbyte*)mem_addr); 66 | 67 | /// 68 | /// Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".
69 | /// 70 | /// MOVDDUP xmm, m64 71 | /// double {FP64} 72 | /// __m128d dst {FP64} 73 | public static __m128d _mm_loaddup_pd(double* mem_addr) => System.Runtime.Intrinsics.X86.Sse3.LoadAndDuplicateToVector128(mem_addr); 74 | 75 | /// 76 | /// Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". 77 | /// 78 | /// MOVDDUP xmm, xmm 79 | /// __m128d {FP64} 80 | /// __m128d dst {FP64} 81 | public static __m128d _mm_movedup_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse3.MoveAndDuplicate(a.FP64); 82 | 83 | /// 84 | /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". 85 | /// 86 | /// MOVSHDUP xmm, xmm 87 | /// __m128 {FP32} 88 | /// __m128 dst {FP32} 89 | public static __m128 _mm_movehdup_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse3.MoveHighAndDuplicate(a.FP32); 90 | 91 | /// 92 | /// Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". 93 | /// 94 | /// MOVSLDUP xmm, xmm 95 | /// __m128 {FP32} 96 | /// __m128 dst {FP32} 97 | public static __m128 _mm_moveldup_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse3.MoveLowAndDuplicate(a.FP32); 98 | 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /RawIntrinsics/Other.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class Other 4 | { 5 | /// 6 | /// Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". 
7 | /// 8 | /// AESDEC xmm, xmm 9 | /// __m128i {M128} 10 | /// __m128i {M128} 11 | /// __m128i dst {M128} 12 | public static __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.Decrypt(a.UI8, RoundKey.UI8); 13 | 14 | /// 15 | /// Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". 16 | /// 17 | /// AESDECLAST xmm, xmm 18 | /// __m128i {M128} 19 | /// __m128i {M128} 20 | /// __m128i dst {M128} 21 | public static __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.DecryptLast(a.UI8, RoundKey.UI8); 22 | 23 | /// 24 | /// Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". 25 | /// 26 | /// AESENC xmm, xmm 27 | /// __m128i {M128} 28 | /// __m128i {M128} 29 | /// __m128i dst {M128} 30 | public static __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.Encrypt(a.UI8, RoundKey.UI8); 31 | 32 | /// 33 | /// Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". 34 | /// 35 | /// AESENCLAST xmm, xmm 36 | /// __m128i {M128} 37 | /// __m128i {M128} 38 | /// __m128i dst {M128} 39 | public static __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) => System.Runtime.Intrinsics.X86.Aes.EncryptLast(a.UI8, RoundKey.UI8); 40 | 41 | /// 42 | /// Perform the InvMixColumns transformation on "a" and store the result in "dst".
43 | /// 44 | /// AESIMC xmm, xmm 45 | /// __m128i {M128} 46 | /// __m128i dst {M128} 47 | public static __m128i _mm_aesimc_si128(__m128i a) => System.Runtime.Intrinsics.X86.Aes.InverseMixColumns(a.UI8); 48 | 49 | /// 50 | /// Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst". 51 | /// 52 | /// AESKEYGENASSIST xmm, xmm, imm8 53 | /// __m128i {M128} 54 | /// int {IMM} 55 | /// __m128i dst {M128} 56 | public static __m128i _mm_aeskeygenassist_si128(__m128i a, int imm8) => System.Runtime.Intrinsics.X86.Aes.KeygenAssist(a.UI8, (byte)imm8); 57 | 58 | /// 59 | /// Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". 60 | /// 61 | /// PCLMULQDQ xmm, xmm, imm8 62 | /// __m128i {M128} 63 | /// __m128i {M128} 64 | /// int {IMM} 65 | /// __m128i dst {M128} 66 | public static __m128i _mm_clmulepi64_si128(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Pclmulqdq.CarrylessMultiply(a.SI64, b.SI64, (byte)imm8); 67 | 68 | /// 69 | /// Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". 70 | /// 71 | /// POPCNT r32, r32 72 | /// int {UI32} 73 | /// int dst {UI32} 74 | public static int _mm_popcnt_u32(int a) => (int)System.Runtime.Intrinsics.X86.Popcnt.PopCount((uint)a); 75 | 76 | /// 77 | /// Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". 78 | /// 79 | /// POPCNT r64, r64 80 | /// long {UI64} 81 | /// long dst {UI64} 82 | public static long _mm_popcnt_u64(long a) => (long)System.Runtime.Intrinsics.X86.Popcnt.X64.PopCount((ulong)a); 83 | 84 | /// 85 | /// Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".
86 | /// 87 | /// TZCNT r32, r32 88 | /// int {UI32} 89 | /// int dst {UI32} 90 | public static int _mm_tzcnt_32(int a) => (int)System.Runtime.Intrinsics.X86.Bmi1.TrailingZeroCount((uint)a); 91 | 92 | /// 93 | /// Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". 94 | /// 95 | /// TZCNT r64, r64 96 | /// long {UI64} 97 | /// long dst {UI64} 98 | public static long _mm_tzcnt_64(long a) => (long)System.Runtime.Intrinsics.X86.Bmi1.X64.TrailingZeroCount((ulong)a); 99 | 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | *.DotSettings.user 13 | 14 | # User-specific files (MonoDevelop/Xamarin Studio) 15 | *.userprefs 16 | 17 | # Mono auto generated files 18 | mono_crash.* 19 | 20 | # Build results 21 | [Dd]ebug/ 22 | [Dd]ebugPublic/ 23 | [Rr]elease/ 24 | [Rr]eleases/ 25 | x64/ 26 | x86/ 27 | [Ww][Ii][Nn]32/ 28 | [Aa][Rr][Mm]/ 29 | [Aa][Rr][Mm]64/ 30 | bld/ 31 | [Bb]in/ 32 | [Oo]bj/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | .idea/ 36 | 37 | # Visual Studio 2015/2017 cache/options directory 38 | .vs/ 39 | # Uncomment if you have tasks that create the project's static files in wwwroot 40 | #wwwroot/ 41 | 42 | # Visual Studio 2017 auto generated files 43 | Generated\ Files/ 44 | 45 | # MSTest test Results 46 | [Tt]est[Rr]esult*/ 47 | [Bb]uild[Ll]og.* 48 | 49 | # NUnit 50 | *.VisualState.xml 51 | TestResult.xml 52 | nunit-*.xml 53 | 54 | # Build Results of an ATL Project 55 | [Dd]ebugPS/ 56 | [Rr]eleasePS/ 57 | dlldata.c 58 | 59 | # Benchmark Results 60 | 
BenchmarkDotNet.Artifacts/ 61 | 62 | # .NET Core 63 | project.lock.json 64 | project.fragment.lock.json 65 | artifacts/ 66 | 67 | # ASP.NET Scaffolding 68 | ScaffoldingReadMe.txt 69 | 70 | # StyleCop 71 | StyleCopReport.xml 72 | 73 | # Files built by Visual Studio 74 | *_i.c 75 | *_p.c 76 | *_h.h 77 | *.ilk 78 | *.meta 79 | *.obj 80 | *.iobj 81 | *.pch 82 | *.pdb 83 | *.ipdb 84 | *.pgc 85 | *.pgd 86 | *.rsp 87 | *.sbr 88 | *.tlb 89 | *.tli 90 | *.tlh 91 | *.tmp 92 | *.tmp_proj 93 | *_wpftmp.csproj 94 | *.log 95 | *.vspscc 96 | *.vssscc 97 | .builds 98 | *.pidb 99 | *.svclog 100 | *.scc 101 | 102 | # Chutzpah Test files 103 | _Chutzpah* 104 | 105 | # Visual C++ cache files 106 | ipch/ 107 | *.aps 108 | *.ncb 109 | *.opendb 110 | *.opensdf 111 | *.sdf 112 | *.cachefile 113 | *.VC.db 114 | *.VC.VC.opendb 115 | 116 | # Visual Studio profiler 117 | *.psess 118 | *.vsp 119 | *.vspx 120 | *.sap 121 | 122 | # Visual Studio Trace Files 123 | *.e2e 124 | 125 | # TFS 2012 Local Workspace 126 | $tf/ 127 | 128 | # Guidance Automation Toolkit 129 | *.gpState 130 | 131 | # ReSharper is a .NET coding add-in 132 | _ReSharper*/ 133 | *.[Rr]e[Ss]harper 134 | *.DotSettings.user 135 | 136 | # TeamCity is a build add-in 137 | _TeamCity* 138 | 139 | # DotCover is a Code Coverage Tool 140 | *.dotCover 141 | 142 | # AxoCover is a Code Coverage Tool 143 | .axoCover/* 144 | !.axoCover/settings.json 145 | 146 | # Coverlet is a free, cross platform Code Coverage Tool 147 | coverage*.json 148 | coverage*.xml 149 | coverage*.info 150 | 151 | # Visual Studio code coverage results 152 | *.coverage 153 | *.coveragexml 154 | 155 | # NCrunch 156 | _NCrunch_* 157 | .*crunch*.local.xml 158 | nCrunchTemp_* 159 | 160 | # MightyMoose 161 | *.mm.* 162 | AutoTest.Net/ 163 | 164 | # Web workbench (sass) 165 | .sass-cache/ 166 | 167 | # Installshield output folder 168 | [Ee]xpress/ 169 | 170 | # DocProject is a documentation generator add-in 171 | DocProject/buildhelp/ 172 | DocProject/Help/*.HxT 173 | 
DocProject/Help/*.HxC 174 | DocProject/Help/*.hhc 175 | DocProject/Help/*.hhk 176 | DocProject/Help/*.hhp 177 | DocProject/Help/Html2 178 | DocProject/Help/html 179 | 180 | # Click-Once directory 181 | publish/ 182 | 183 | # Publish Web Output 184 | *.[Pp]ublish.xml 185 | *.azurePubxml 186 | # Note: Comment the next line if you want to checkin your web deploy settings, 187 | # but database connection strings (with potential passwords) will be unencrypted 188 | *.pubxml 189 | *.publishproj 190 | 191 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 192 | # checkin your Azure Web App publish settings, but sensitive information contained 193 | # in these scripts will be unencrypted 194 | PublishScripts/ 195 | 196 | # NuGet Packages 197 | *.nupkg 198 | # NuGet Symbol Packages 199 | *.snupkg 200 | # The packages folder can be ignored because of Package Restore 201 | **/[Pp]ackages/* 202 | # except build/, which is used as an MSBuild target. 203 | !**/[Pp]ackages/build/ 204 | # Uncomment if necessary however generally it will be regenerated when needed 205 | #!**/[Pp]ackages/repositories.config 206 | # NuGet v3's project.json files produces more ignorable files 207 | *.nuget.props 208 | *.nuget.targets 209 | 210 | # Microsoft Azure Build Output 211 | csx/ 212 | *.build.csdef 213 | 214 | # Microsoft Azure Emulator 215 | ecf/ 216 | rcf/ 217 | 218 | # Windows Store app package directories and files 219 | AppPackages/ 220 | BundleArtifacts/ 221 | Package.StoreAssociation.xml 222 | _pkginfo.txt 223 | *.appx 224 | *.appxbundle 225 | *.appxupload 226 | 227 | # Visual Studio cache files 228 | # files ending in .cache can be ignored 229 | *.[Cc]ache 230 | # but keep track of directories ending in .cache 231 | !?*.[Cc]ache/ 232 | 233 | # Others 234 | ClientBin/ 235 | ~$* 236 | *~ 237 | *.dbmdl 238 | *.dbproj.schemaview 239 | *.jfm 240 | *.pfx 241 | *.publishsettings 242 | orleans.codegen.cs 243 | 244 | # Including strong name files can present a 
security risk 245 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 246 | #*.snk 247 | 248 | # Since there are multiple workflows, uncomment next line to ignore bower_components 249 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 250 | #bower_components/ 251 | 252 | # RIA/Silverlight projects 253 | Generated_Code/ 254 | 255 | # Backup & report files from converting an old project file 256 | # to a newer Visual Studio version. Backup files are not needed, 257 | # because we have git ;-) 258 | _UpgradeReport_Files/ 259 | Backup*/ 260 | UpgradeLog*.XML 261 | UpgradeLog*.htm 262 | ServiceFabricBackup/ 263 | *.rptproj.bak 264 | 265 | # SQL Server files 266 | *.mdf 267 | *.ldf 268 | *.ndf 269 | 270 | # Business Intelligence projects 271 | *.rdl.data 272 | *.bim.layout 273 | *.bim_*.settings 274 | *.rptproj.rsuser 275 | *- [Bb]ackup.rdl 276 | *- [Bb]ackup ([0-9]).rdl 277 | *- [Bb]ackup ([0-9][0-9]).rdl 278 | 279 | # Microsoft Fakes 280 | FakesAssemblies/ 281 | 282 | # GhostDoc plugin setting file 283 | *.GhostDoc.xml 284 | 285 | # Node.js Tools for Visual Studio 286 | .ntvs_analysis.dat 287 | node_modules/ 288 | 289 | # Visual Studio 6 build log 290 | *.plg 291 | 292 | # Visual Studio 6 workspace options file 293 | *.opt 294 | 295 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
296 | *.vbw 297 | 298 | # Visual Studio LightSwitch build output 299 | **/*.HTMLClient/GeneratedArtifacts 300 | **/*.DesktopClient/GeneratedArtifacts 301 | **/*.DesktopClient/ModelManifest.xml 302 | **/*.Server/GeneratedArtifacts 303 | **/*.Server/ModelManifest.xml 304 | _Pvt_Extensions 305 | 306 | # Paket dependency manager 307 | .paket/paket.exe 308 | paket-files/ 309 | 310 | # FAKE - F# Make 311 | .fake/ 312 | 313 | # CodeRush personal settings 314 | .cr/personal 315 | 316 | # Python Tools for Visual Studio (PTVS) 317 | __pycache__/ 318 | *.pyc 319 | 320 | # Cake - Uncomment if you are using it 321 | # tools/** 322 | # !tools/packages.config 323 | 324 | # Tabs Studio 325 | *.tss 326 | 327 | # Telerik's JustMock configuration file 328 | *.jmconfig 329 | 330 | # BizTalk build output 331 | *.btp.cs 332 | *.btm.cs 333 | *.odx.cs 334 | *.xsd.cs 335 | 336 | # OpenCover UI analysis results 337 | OpenCover/ 338 | 339 | # Azure Stream Analytics local run output 340 | ASALocalRun/ 341 | 342 | # MSBuild Binary and Structured Log 343 | *.binlog 344 | 345 | # NVidia Nsight GPU debugger configuration file 346 | *.nvuser 347 | 348 | # MFractors (Xamarin productivity tool) working folder 349 | .mfractor/ 350 | 351 | # Local History for Visual Studio 352 | .localhistory/ 353 | 354 | # BeatPulse healthcheck temp database 355 | healthchecksdb 356 | 357 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 358 | MigrationBackup/ 359 | 360 | # Ionide (cross platform F# VS Code tools) working folder 361 | .ionide/ 362 | 363 | # Fody - auto-generated XML schema 364 | FodyWeavers.xsd -------------------------------------------------------------------------------- /RawIntrinsics/SSSE3.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class SSSE3 4 | { 5 | /// 6 | /// Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results 
in "dst". 7 | /// 8 | /// PABSW xmm, xmm 9 | /// __m128i {SI16} 10 | /// __m128i dst {UI16} 11 | public static __m128i _mm_abs_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI16); 12 | 13 | /// 14 | /// Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". 15 | /// 16 | /// PABSD xmm, xmm 17 | /// __m128i {SI32} 18 | /// __m128i dst {UI32} 19 | public static __m128i _mm_abs_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI32); 20 | 21 | /// 22 | /// Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". 23 | /// 24 | /// PABSB xmm, xmm 25 | /// __m128i {SI8} 26 | /// __m128i dst {UI8} 27 | public static __m128i _mm_abs_epi8(__m128i a) => System.Runtime.Intrinsics.X86.Ssse3.Abs(a.SI8); 28 | 29 | /// 30 | /// Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". 31 | /// 32 | /// PALIGNR xmm, xmm, imm8 33 | /// __m128i {UI8} 34 | /// __m128i {UI8} 35 | /// int {IMM} 36 | /// __m128i dst {UI8} 37 | public static __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Ssse3.AlignRight(a.UI8, b.UI8, (byte)imm8); 38 | 39 | /// 40 | /// Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". 41 | /// 42 | /// PHADDW xmm, xmm 43 | /// __m128i {SI16} 44 | /// __m128i {SI16} 45 | /// __m128i dst {SI16} 46 | public static __m128i _mm_hadd_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAdd(a.SI16, b.SI16); 47 | 48 | /// 49 | /// Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". 
50 | /// 51 | /// PHADDD xmm, xmm 52 | /// __m128i {SI32} 53 | /// __m128i {SI32} 54 | /// __m128i dst {SI32} 55 | public static __m128i _mm_hadd_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAdd(a.SI32, b.SI32); 56 | 57 | /// 58 | /// Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". 59 | /// 60 | /// PHADDSW xmm, xmm 61 | /// __m128i {SI16} 62 | /// __m128i {SI16} 63 | /// __m128i dst {SI16} 64 | public static __m128i _mm_hadds_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalAddSaturate(a.SI16, b.SI16); 65 | 66 | /// 67 | /// Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". 68 | /// 69 | /// PHSUBW xmm, xmm 70 | /// __m128i {SI16} 71 | /// __m128i {SI16} 72 | /// __m128i dst {SI16} 73 | public static __m128i _mm_hsub_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtract(a.SI16, b.SI16); 74 | 75 | /// 76 | /// Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". 77 | /// 78 | /// PHSUBD xmm, xmm 79 | /// __m128i {SI32} 80 | /// __m128i {SI32} 81 | /// __m128i dst {SI32} 82 | public static __m128i _mm_hsub_epi32(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtract(a.SI32, b.SI32); 83 | 84 | /// 85 | /// Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". 
86 | /// 87 | /// PHSUBSW xmm, xmm 88 | /// __m128i {SI16} 89 | /// __m128i {SI16} 90 | /// __m128i dst {SI16} 91 | public static __m128i _mm_hsubs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.HorizontalSubtractSaturate(a.SI16, b.SI16); 92 | 93 | /// 94 | /// Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". 95 | /// 96 | /// PMADDUBSW xmm, xmm 97 | /// __m128i {UI8} 98 | /// __m128i {SI8} 99 | /// __m128i dst {SI16} 100 | public static __m128i _mm_maddubs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(a.UI8, b.SI8); 101 | 102 | /// 103 | /// Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". 104 | /// 105 | /// PMULHRSW xmm, xmm 106 | /// __m128i {SI16} 107 | /// __m128i {SI16} 108 | /// __m128i dst {UI16} 109 | public static __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.MultiplyHighRoundScale(a.SI16, b.SI16); 110 | 111 | /// 112 | /// Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". 113 | /// 114 | /// PSHUFB xmm, xmm 115 | /// __m128i {UI8} 116 | /// __m128i {UI8} 117 | /// __m128i dst {UI8} 118 | public static __m128i _mm_shuffle_epi8(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Ssse3.Shuffle(a.UI8, b.UI8); 119 | 120 | /// 121 | /// Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Element in "dst" are zeroed out when the corresponding element in "b" is zero. 
///
/// PSIGNW xmm, xmm
/// __m128i {SI16}
/// __m128i {SI16}
/// __m128i dst {UI16}
public static __m128i _mm_sign_epi16(__m128i a, __m128i b) =>
    System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI16, b.SI16);

/// <summary>
/// Negates packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and stores the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.
/// </summary>
/// <remarks>Instruction: PSIGND xmm, xmm</remarks>
/// <param name="a">__m128i {SI32}</param>
/// <param name="b">__m128i {SI32}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_sign_epi32(__m128i a, __m128i b) =>
    System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI32, b.SI32);

/// <summary>
/// Negates packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and stores the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.
/// </summary>
/// <remarks>Instruction: PSIGNB xmm, xmm</remarks>
/// <param name="a">__m128i {SI8}</param>
/// <param name="b">__m128i {SI8}</param>
/// <returns>__m128i dst {UI8}</returns>
public static __m128i _mm_sign_epi8(__m128i a, __m128i b) =>
    System.Runtime.Intrinsics.X86.Ssse3.Sign(a.SI8, b.SI8);

    }
}

--------------------------------------------------------------------------------
/RawIntrinsics/Types.cs:
--------------------------------------------------------------------------------
namespace RawIntrinsics
{
    using System.Runtime.Intrinsics;

    /// <summary>
    /// 64-bit vector wrapper. The bits are stored once as <c>Vector64&lt;byte&gt;</c>; each property
    /// reinterprets them as a vector of another element type, and any typed <c>Vector64&lt;T&gt;</c>
    /// listed below converts to this struct implicitly.
    /// </summary>
    public struct __m64
    {
        // Single backing store; all views and conversions go through it.
        private Vector64<byte> _;

        public Vector64<byte> UI8 => _.AsByte();
        public Vector64<sbyte> SI8 => _.AsSByte();
        public Vector64<ushort> UI16 => _.AsUInt16();
        public Vector64<short> SI16 => _.AsInt16();
        public Vector64<uint> UI32 => _.AsUInt32();
        public Vector64<int> SI32 => _.AsInt32();
        public Vector64<ulong> UI64 => _.AsUInt64();
        public Vector64<long> SI64 => _.AsInt64();
        public Vector64<float> FP32 => _.AsSingle();
        public Vector64<double> FP64 => _.AsDouble();

        public static implicit operator __m64(Vector64<byte> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<sbyte> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<ushort> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<short> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<uint> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<int> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<ulong> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<long> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<float> v) => new __m64 { _ = v.AsByte() };
        public static implicit operator __m64(Vector64<double> v) => new __m64 { _ = v.AsByte() };
    }

    /// <summary>
    /// 128-bit vector wrapper. The bits are stored once as <c>Vector128&lt;byte&gt;</c>; each property
    /// reinterprets them as a vector of another element type, and any typed <c>Vector128&lt;T&gt;</c>
    /// listed below converts to this struct implicitly.
    /// </summary>
    public struct __m128
    {
        // Single backing store; all views and conversions go through it.
        private Vector128<byte> _;

        public Vector128<byte> UI8 => _.AsByte();
        public Vector128<sbyte> SI8 => _.AsSByte();
        public Vector128<ushort> UI16 => _.AsUInt16();
        public Vector128<short> SI16 => _.AsInt16();
        public Vector128<uint> UI32 => _.AsUInt32();
        public Vector128<int> SI32 => _.AsInt32();
        public Vector128<ulong> UI64 => _.AsUInt64();
        public Vector128<long> SI64 => _.AsInt64();
        public Vector128<float> FP32 => _.AsSingle();
        public Vector128<double> FP64 => _.AsDouble();

        public static implicit operator __m128(Vector128<byte> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<sbyte> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<ushort> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<short> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<uint> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<int> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<ulong> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<long> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<float> v) => new __m128 { _ = v.AsByte() };
        public static implicit operator __m128(Vector128<double> v) => new __m128 { _ = v.AsByte() };
    }

    /// <summary>
    /// 128-bit vector wrapper used by the integer intrinsics. Same layout and conversions as
    /// <c>__m128</c>: one <c>Vector128&lt;byte&gt;</c> field with reinterpreting views.
    /// </summary>
    public struct __m128i
    {
        // Single backing store; all views and conversions go through it.
        private Vector128<byte> _;

        public Vector128<byte> UI8 => _.AsByte();
        public Vector128<sbyte> SI8 => _.AsSByte();
        public Vector128<ushort> UI16 => _.AsUInt16();
        public Vector128<short> SI16 => _.AsInt16();
        public Vector128<uint> UI32 => _.AsUInt32();
        public Vector128<int> SI32 => _.AsInt32();
        public Vector128<ulong> UI64 => _.AsUInt64();
        public Vector128<long> SI64 => _.AsInt64();
        public Vector128<float> FP32 => _.AsSingle();
        public Vector128<double> FP64 => _.AsDouble();

        public static implicit operator __m128i(Vector128<byte> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<sbyte> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<ushort> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<short> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<uint> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<int> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<ulong> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<long> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<float> v) => new __m128i { _ = v.AsByte() };
        public static implicit operator __m128i(Vector128<double> v) => new __m128i { _ = v.AsByte() };
    }

    /// <summary>
    /// 128-bit vector wrapper used by the double-precision intrinsics. Same layout and conversions
    /// as <c>__m128</c>: one <c>Vector128&lt;byte&gt;</c> field with reinterpreting views.
    /// </summary>
    public struct __m128d
    {
        // Single backing store; all views and conversions go through it.
        private Vector128<byte> _;

        public Vector128<byte> UI8 => _.AsByte();
        public Vector128<sbyte> SI8 => _.AsSByte();
        public Vector128<ushort> UI16 => _.AsUInt16();
        public Vector128<short> SI16 => _.AsInt16();
        public Vector128<uint> UI32 => _.AsUInt32();
        public Vector128<int> SI32 => _.AsInt32();
        public Vector128<ulong> UI64 => _.AsUInt64();
        public Vector128<long> SI64 => _.AsInt64();
        public Vector128<float> FP32 => _.AsSingle();
        public Vector128<double> FP64 => _.AsDouble();

        public static implicit operator __m128d(Vector128<byte> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<sbyte> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<ushort> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<short> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<uint> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<int> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<ulong> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<long> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<float> v) => new __m128d { _ = v.AsByte() };
        public static implicit operator __m128d(Vector128<double> v) => new __m128d { _ = v.AsByte() };
    }

    /// <summary>
    /// 256-bit vector wrapper. The bits are stored once as <c>Vector256&lt;byte&gt;</c>; each property
    /// reinterprets them as a vector of another element type, and any typed <c>Vector256&lt;T&gt;</c>
    /// listed below converts to this struct implicitly.
    /// </summary>
    public struct __m256
    {
        // Single backing store; all views and conversions go through it.
        private Vector256<byte> _;

        public Vector256<byte> UI8 => _.AsByte();
        public Vector256<sbyte> SI8 => _.AsSByte();
        public Vector256<ushort> UI16 => _.AsUInt16();
        public Vector256<short> SI16 => _.AsInt16();
        public Vector256<uint> UI32 => _.AsUInt32();
        public Vector256<int> SI32 => _.AsInt32();
        public Vector256<ulong> UI64 => _.AsUInt64();
        public Vector256<long> SI64 => _.AsInt64();
        public Vector256<float> FP32 => _.AsSingle();
        public Vector256<double> FP64 => _.AsDouble();

        public static implicit operator __m256(Vector256<byte> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<sbyte> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<ushort> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<short> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<uint> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<int> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<ulong> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<long> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<float> v) => new __m256 { _ = v.AsByte() };
        public static implicit operator __m256(Vector256<double> v) => new __m256 { _ = v.AsByte() };
    }

    /// <summary>
    /// 256-bit vector wrapper used by the integer intrinsics. Same layout and conversions as
    /// <c>__m256</c>: one <c>Vector256&lt;byte&gt;</c> field with reinterpreting views.
    /// </summary>
    public struct __m256i
    {
        // Single backing store; all views and conversions go through it.
        private Vector256<byte> _;

        public Vector256<byte> UI8 => _.AsByte();
        public Vector256<sbyte> SI8 => _.AsSByte();
        public Vector256<ushort> UI16 => _.AsUInt16();
        public Vector256<short> SI16 => _.AsInt16();
        public Vector256<uint> UI32 => _.AsUInt32();
        public Vector256<int> SI32 => _.AsInt32();
        public Vector256<ulong> UI64 => _.AsUInt64();
        public Vector256<long> SI64 => _.AsInt64();
        public Vector256<float> FP32 => _.AsSingle();
        public Vector256<double> FP64 => _.AsDouble();

        public static implicit operator __m256i(Vector256<byte> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<sbyte> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<ushort> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<short> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<uint> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<int> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<ulong> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<long> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<float> v) => new __m256i { _ = v.AsByte() };
        public static implicit operator __m256i(Vector256<double> v) => new __m256i { _ = v.AsByte() };
    }

    /// <summary>
    /// 256-bit vector wrapper used by the double-precision intrinsics. Same layout and conversions
    /// as <c>__m256</c>: one <c>Vector256&lt;byte&gt;</c> field with reinterpreting views.
    /// </summary>
    public struct __m256d
    {
        // Single backing store; all views and conversions go through it.
        private Vector256<byte> _;

        public Vector256<byte> UI8 => _.AsByte();
        public Vector256<sbyte> SI8 => _.AsSByte();
        public Vector256<ushort> UI16 => _.AsUInt16();
        public Vector256<short> SI16 => _.AsInt16();
        public Vector256<uint> UI32 => _.AsUInt32();
        public Vector256<int> SI32 => _.AsInt32();
        public Vector256<ulong> UI64 => _.AsUInt64();
        public Vector256<long> SI64 => _.AsInt64();
        public Vector256<float> FP32 => _.AsSingle();
        public Vector256<double> FP64 => _.AsDouble();

        public static implicit operator __m256d(Vector256<byte> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<sbyte> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<ushort> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<short> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<uint> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<int> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<ulong> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<long> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<float> v) => new __m256d { _ = v.AsByte() };
        public static implicit operator __m256d(Vector256<double> v) => new __m256d { _ = v.AsByte() };
    }

}

--------------------------------------------------------------------------------
/RawIntrinsics/FMA.cs:
--------------------------------------------------------------------------------
namespace RawIntrinsics
{
    public static unsafe partial class FMA
    {
        /// <summary>
        /// Multiplies packed double-precision (64-bit) floating-point elements in "a" and "b", adds the intermediate result to the packed elements in "c", and stores the results in "dst".
        /// </summary>
        /// <remarks>Instruction: VFMADD132PD xmm, xmm, xmm</remarks>
        /// <param name="a">__m128d {FP64}</param>
        /// <param name="b">__m128d {FP64}</param>
        /// <param name="c">__m128d {FP64}</param>
        /// <returns>__m128d dst {FP64}</returns>
        public static __m128d _mm_fmadd_pd(__m128d a, __m128d b, __m128d c) =>
            System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP64, b.FP64, c.FP64);

        /// <summary>
        /// Multiplies packed single-precision (32-bit) floating-point elements in "a" and "b", adds the intermediate result to the packed elements in "c", and stores the results in "dst".
        /// </summary>
        /// <remarks>Instruction: VFMADD132PS xmm, xmm, xmm</remarks>
        /// <param name="a">__m128 {FP32}</param>
        /// <param name="b">__m128 {FP32}</param>
        /// <param name="c">__m128 {FP32}</param>
        /// <returns>__m128 dst {FP32}</returns>
        public static __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) =>
            System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP32, b.FP32, c.FP32);

        ///
        /// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
///
/// VFMADD132SD xmm, xmm, xmm
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d dst {FP64}
public static __m128d _mm_fmadd_sd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddScalar(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies the lower single-precision (32-bit) floating-point elements of "a" and "b" and adds the intermediate result to the lower element of "c". The result is stored in the lower element of "dst"; the upper 3 packed elements of "dst" are copied from "a".
/// </summary>
/// <remarks>Instruction: VFMADD132SS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fmadd_ss(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddScalar(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", then alternately adds and subtracts the packed elements of "c" to/from the intermediate result, storing the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMADDSUB132PD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fmaddsub_pd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP64, b.FP64, c.FP64);
}

///
/// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".
///
/// VFMADDSUB132PS xmm, xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", subtracts the packed elements of "c" from the intermediate result, and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMSUB132PD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fmsub_pd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies packed single-precision (32-bit) floating-point elements of "a" and "b", subtracts the packed elements of "c" from the intermediate result, and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMSUB132PS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP32, b.FP32, c.FP32);
}

///
/// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
///
/// VFMSUB132SD xmm, xmm, xmm
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d dst {FP64}
public static __m128d _mm_fmsub_sd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractScalar(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies the lower single-precision (32-bit) floating-point elements of "a" and "b" and subtracts the lower element of "c" from the intermediate result. The result is stored in the lower element of "dst"; the upper 3 packed elements of "dst" are copied from "a".
/// </summary>
/// <remarks>Instruction: VFMSUB132SS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fmsub_ss(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractScalar(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", then alternately subtracts and adds the packed elements of "c" from/to the intermediate result, storing the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMSUBADD132PD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fmsubadd_pd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP64, b.FP64, c.FP64);
}

///
/// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".
///
/// VFMSUBADD132PS xmm, xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_fmsubadd_ps(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", adds the negated intermediate result to the packed elements of "c", and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFNMADD132PD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fnmadd_pd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies packed single-precision (32-bit) floating-point elements of "a" and "b", adds the negated intermediate result to the packed elements of "c", and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFNMADD132PS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP32, b.FP32, c.FP32);
}

///
/// Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
///
/// VFNMADD132SD xmm, xmm, xmm
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d dst {FP64}
public static __m128d _mm_fnmadd_sd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegatedScalar(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies the lower single-precision (32-bit) floating-point elements of "a" and "b" and adds the negated intermediate result to the lower element of "c". The result is stored in the lower element of "dst"; the upper 3 packed elements of "dst" are copied from "a".
/// </summary>
/// <remarks>Instruction: VFNMADD132SS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fnmadd_ss(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegatedScalar(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", subtracts the packed elements of "c" from the negated intermediate result, and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFNMSUB132PD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fnmsub_pd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP64, b.FP64, c.FP64);
}

///
/// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".
///
/// VFNMSUB132PS xmm, xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies the lower double-precision (64-bit) floating-point elements of "a" and "b" and subtracts the lower element of "c" from the negated intermediate result. The result is stored in the lower element of "dst"; the upper element of "dst" is copied from "a".
/// </summary>
/// <remarks>Instruction: VFNMSUB132SD xmm, xmm, xmm</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="b">__m128d {FP64}</param>
/// <param name="c">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_fnmsub_sd(__m128d a, __m128d b, __m128d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegatedScalar(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies the lower single-precision (32-bit) floating-point elements of "a" and "b" and subtracts the lower element of "c" from the negated intermediate result. The result is stored in the lower element of "dst"; the upper 3 packed elements of "dst" are copied from "a".
/// </summary>
/// <remarks>Instruction: VFNMSUB132SS xmm, xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="c">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_fnmsub_ss(__m128 a, __m128 b, __m128 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegatedScalar(a.FP32, b.FP32, c.FP32);
}

///
/// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
///
/// VFMADD132PD ymm, ymm, ymm
/// __m256d {FP64}
/// __m256d {FP64}
/// __m256d {FP64}
/// __m256d dst {FP64}
public static __m256d _mm256_fmadd_pd(__m256d a, __m256d b, __m256d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP64, b.FP64, c.FP64);
}

/// <summary>
/// Multiplies packed single-precision (32-bit) floating-point elements of "a" and "b", adds the intermediate result to the packed elements of "c", and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMADD132PS ymm, ymm, ymm</remarks>
/// <param name="a">__m256 {FP32}</param>
/// <param name="b">__m256 {FP32}</param>
/// <param name="c">__m256 {FP32}</param>
/// <returns>__m256 dst {FP32}</returns>
public static __m256 _mm256_fmadd_ps(__m256 a, __m256 b, __m256 c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(a.FP32, b.FP32, c.FP32);
}

/// <summary>
/// Multiplies packed double-precision (64-bit) floating-point elements of "a" and "b", then alternately adds and subtracts the packed elements of "c" to/from the intermediate result, storing the results in "dst".
/// </summary>
/// <remarks>Instruction: VFMADDSUB132PD ymm, ymm, ymm</remarks>
/// <param name="a">__m256d {FP64}</param>
/// <param name="b">__m256d {FP64}</param>
/// <param name="c">__m256d {FP64}</param>
/// <returns>__m256d dst {FP64}</returns>
public static __m256d _mm256_fmaddsub_pd(__m256d a, __m256d b, __m256d c)
{
    return System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP64, b.FP64, c.FP64);
}

///
/// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".
237 | /// 238 | /// VFMADDSUB132PS ymm, ymm, ymm 239 | /// __m256 {FP32} 240 | /// __m256 {FP32} 241 | /// __m256 {FP32} 242 | /// __m256 dst {FP32} 243 | public static __m256 _mm256_fmaddsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddSubtract(a.FP32, b.FP32, c.FP32); 244 | 245 | /// 246 | /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 247 | /// 248 | /// VFMSUB132PD ymm, ymm, ymm 249 | /// __m256d {FP64} 250 | /// __m256d {FP64} 251 | /// __m256d {FP64} 252 | /// __m256d dst {FP64} 253 | public static __m256d _mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP64, b.FP64, c.FP64); 254 | 255 | /// 256 | /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 257 | /// 258 | /// VFMSUB132PS ymm, ymm, ymm 259 | /// __m256 {FP32} 260 | /// __m256 {FP32} 261 | /// __m256 {FP32} 262 | /// __m256 dst {FP32} 263 | public static __m256 _mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtract(a.FP32, b.FP32, c.FP32); 264 | 265 | /// 266 | /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
267 | /// 268 | /// VFMSUBADD132PD ymm, ymm, ymm 269 | /// __m256d {FP64} 270 | /// __m256d {FP64} 271 | /// __m256d {FP64} 272 | /// __m256d dst {FP64} 273 | public static __m256d _mm256_fmsubadd_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP64, b.FP64, c.FP64); 274 | 275 | /// 276 | /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 277 | /// 278 | /// VFMSUBADD132PS ymm, ymm, ymm 279 | /// __m256 {FP32} 280 | /// __m256 {FP32} 281 | /// __m256 {FP32} 282 | /// __m256 dst {FP32} 283 | public static __m256 _mm256_fmsubadd_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractAdd(a.FP32, b.FP32, c.FP32); 284 | 285 | /// 286 | /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 287 | /// 288 | /// VFNMADD132PD ymm, ymm, ymm 289 | /// __m256d {FP64} 290 | /// __m256d {FP64} 291 | /// __m256d {FP64} 292 | /// __m256d dst {FP64} 293 | public static __m256d _mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP64, b.FP64, c.FP64); 294 | 295 | /// 296 | /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
297 | /// 298 | /// VFNMADD132PS ymm, ymm, ymm 299 | /// __m256 {FP32} 300 | /// __m256 {FP32} 301 | /// __m256 {FP32} 302 | /// __m256 dst {FP32} 303 | public static __m256 _mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplyAddNegated(a.FP32, b.FP32, c.FP32); 304 | 305 | /// 306 | /// Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". 307 | /// 308 | /// VFNMSUB132PD ymm, ymm, ymm 309 | /// __m256d {FP64} 310 | /// __m256d {FP64} 311 | /// __m256d {FP64} 312 | /// __m256d dst {FP64} 313 | public static __m256d _mm256_fnmsub_pd(__m256d a, __m256d b, __m256d c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP64, b.FP64, c.FP64); 314 | 315 | /// 316 | /// Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". 
317 | /// 318 | /// VFNMSUB132PS ymm, ymm, ymm 319 | /// __m256 {FP32} 320 | /// __m256 {FP32} 321 | /// __m256 {FP32} 322 | /// __m256 dst {FP32} 323 | public static __m256 _mm256_fnmsub_ps(__m256 a, __m256 b, __m256 c) => System.Runtime.Intrinsics.X86.Fma.MultiplySubtractNegated(a.FP32, b.FP32, c.FP32); 324 | 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /RawIntrinsicsGenerator/Generator.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Concurrent; 3 | using System.Collections.Generic; 4 | using System.Diagnostics; 5 | using System.IO; 6 | using System.Linq; 7 | using System.Net.Http; 8 | using System.Runtime.Intrinsics; 9 | using System.Text; 10 | using System.Text.RegularExpressions; 11 | using System.Threading.Tasks; 12 | using System.Xml; 13 | using Microsoft.CodeAnalysis; 14 | using Microsoft.CodeAnalysis.CSharp; 15 | using Microsoft.CodeAnalysis.CSharp.Syntax; 16 | 17 | namespace RawIntrinsicsGenerator 18 | { 19 | public static class Generator 20 | { 21 | private const string SriDataUrl1 = @"https://raw.githubusercontent.com/dotnet/runtime/master/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/"; 22 | private const string SriDataUrl2 = @"https://raw.githubusercontent.com/dotnet/runtime/master/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/"; 23 | private const string IntelDataUrl = @"https://software.intel.com/sites/landingpage/IntrinsicsGuide/files/data-latest.xml"; 24 | 25 | private static readonly Regex IntelMethodSignature = new(@"///\s+?(?[\w_]+)\s+?(?_mm[\w_]+)\s*?\((?[\w\s,*]+)\)", RegexOptions.Compiled); 26 | private static readonly Regex IntelMethodSignatureSimpilfied = new(@"\s+?(?[\w_]+)\s+?(?_mm[\w_]+)\s*?", RegexOptions.Compiled); 27 | private static readonly Regex IntelTypeDef = 
new(@"(?:(?unsigned)\s+?)?(?:const\s+)?(?void|char|short|int|long|long\s+?long|float|double|__int32|__int64|(?:(?:__m64|__m128|__m256)(?:i|d)?)|__mmask8|__mmask16|__mmask32|__mmask64)[^*""]*(?\*)?", RegexOptions.Compiled); 28 | 29 | private static readonly Dictionary TechnologyMap = new() 30 | { 31 | {"Sse", (SriDataUrl1, IntelMethodSignature)}, 32 | {"Sse2", (SriDataUrl1, IntelMethodSignature)}, 33 | {"Sse3", (SriDataUrl1, IntelMethodSignature)}, 34 | {"Sse41", (SriDataUrl1, IntelMethodSignature)}, 35 | {"Sse42", (SriDataUrl1, IntelMethodSignature)}, 36 | {"Ssse3", (SriDataUrl1, IntelMethodSignature)}, 37 | {"Avx", (SriDataUrl1, IntelMethodSignature)}, 38 | {"Avx2", (SriDataUrl1, IntelMethodSignature)}, 39 | {"Fma", (SriDataUrl1, IntelMethodSignature)}, 40 | {"Aes", (SriDataUrl1, IntelMethodSignature)}, 41 | {"Bmi1", (SriDataUrl1, IntelMethodSignature)}, 42 | {"Bmi2", (SriDataUrl1, IntelMethodSignature)}, 43 | {"Lzcnt", (SriDataUrl1, IntelMethodSignature)}, 44 | {"Popcnt", (SriDataUrl1, IntelMethodSignature)}, 45 | {"Pclmulqdq", (SriDataUrl1, IntelMethodSignature)}, 46 | {"Vector64", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, 47 | {"Vector128", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, 48 | {"Vector256", (SriDataUrl2, IntelMethodSignatureSimpilfied)}, 49 | }; 50 | 51 | public static async Task Generate(string ns, string saveToPath) 52 | { 53 | var intelDataFile = await FetchFileContent(IntelDataUrl); 54 | 55 | var xml = new XmlDocument(); 56 | xml.LoadXml(intelDataFile); 57 | var intelData = new ConcurrentBag(xml.SelectNodes(@"//intrinsic")?.Cast().ToList() ?? 
new List()); 58 | 59 | var outputData = new ConcurrentDictionary>(); 60 | 61 | foreach (var kv in TechnologyMap) 62 | { 63 | await Generate($"{kv.Value.srcUrl}{kv.Key}.cs", kv.Value.matcher, intelData, outputData); 64 | } 65 | 66 | if (Directory.Exists(saveToPath)) 67 | { 68 | foreach(var fi in new DirectoryInfo(saveToPath).GetFiles()) 69 | { 70 | fi.Delete(); 71 | } 72 | Directory.Delete(saveToPath); 73 | } 74 | Directory.CreateDirectory(saveToPath); 75 | 76 | var codeGenSb = new StringBuilder(); 77 | string tabOffset; 78 | foreach (var (tech, generatedSrc) in outputData) 79 | { 80 | codeGenSb.Clear(); 81 | tabOffset = ""; 82 | codeGenSb.AppendLine($"{tabOffset}namespace {ns}"); 83 | codeGenSb.AppendLine($"{tabOffset}{{"); 84 | 85 | tabOffset = "\t"; 86 | codeGenSb.AppendLine($"{tabOffset}public static unsafe partial class {tech}"); 87 | codeGenSb.AppendLine($"{tabOffset}{{"); 88 | 89 | foreach (var intelMethodName in generatedSrc.Keys.OrderBy(_ => _)) 90 | { 91 | codeGenSb.AppendLine(generatedSrc[intelMethodName]); 92 | } 93 | 94 | tabOffset = "\t"; 95 | codeGenSb.AppendLine($"{tabOffset}}}"); 96 | 97 | tabOffset = ""; 98 | codeGenSb.AppendLine($"{tabOffset}}}"); 99 | 100 | await File.WriteAllTextAsync(Path.Combine(saveToPath, $"{tech}.cs"), codeGenSb.ToString()); 101 | } 102 | 103 | codeGenSb.Clear(); 104 | tabOffset = ""; 105 | codeGenSb.AppendLine($"{tabOffset}namespace {ns}"); 106 | codeGenSb.AppendLine($"{tabOffset}{{"); 107 | 108 | foreach (var t in new[] {(64, null), (128, null), (128, "i"), (128, "d"), (256, null), (256, "i"), (256, "d")}) 109 | { 110 | var (size, pf) = t; 111 | codeGenSb.AppendLine(GenerateMType(size, pf)); 112 | } 113 | tabOffset = ""; 114 | codeGenSb.AppendLine($"{tabOffset}}}"); 115 | 116 | await File.WriteAllTextAsync(Path.Combine(saveToPath, $"Types.cs"), codeGenSb.ToString()); 117 | } 118 | 119 | private static readonly (string etype, string convFn)[] EtypeToReninterpretMethodMap = {("UI8", "AsByte"), ("SI8", "AsSByte"), ("UI16", 
"AsUInt16"), ("SI16", "AsInt16"), ("UI32", "AsUInt32"), ("SI32", "AsInt32"), ("UI64", "AsUInt64"), ("SI64", "AsInt64"), ("FP32", "AsSingle"), ("FP64", "AsDouble")}; 120 | 121 | private static string GenerateMType(int size, string pf = null) 122 | { 123 | var mTypeName = pf == null ? $"__m{size}" : $"__m{size}{pf}" ; 124 | var codeGenSb = new StringBuilder(); 125 | 126 | var csVectorTypeName = $"System.Runtime.Intrinsics.Vector{size}"; 127 | 128 | var tabOffset = "\t"; 129 | codeGenSb.AppendLine($"{tabOffset}public struct {mTypeName}"); 130 | codeGenSb.AppendLine($"{tabOffset}{{"); 131 | 132 | tabOffset = "\t\t"; 133 | codeGenSb.AppendLine($"{tabOffset}private {csVectorTypeName} _;"); 134 | 135 | foreach (var (etype, convFn) in EtypeToReninterpretMethodMap) 136 | { 137 | var csType = EtypeToCsTypeName(etype); 138 | codeGenSb.AppendLine($"{tabOffset}public {csVectorTypeName}<{csType}> {etype} => {csVectorTypeName}.{convFn}(_);"); 139 | } 140 | 141 | foreach (var (etype, _) in EtypeToReninterpretMethodMap) 142 | { 143 | var csType = EtypeToCsTypeName(etype); 144 | codeGenSb.AppendLine($"{tabOffset}public static implicit operator {mTypeName}({csVectorTypeName}<{csType}> v) => new {mTypeName} {{ _ = {csVectorTypeName}.AsByte(v) }};"); 145 | } 146 | 147 | tabOffset = "\t"; 148 | codeGenSb.AppendLine($"{tabOffset}}}"); 149 | return codeGenSb.ToString(); 150 | } 151 | 152 | private static async Task Generate(string sriUrl, Regex cppIntrinsicNameMatcher, ConcurrentBag intelData, ConcurrentDictionary> outputData) 153 | { 154 | var intelMethod2CsMethodMap = new Dictionary>(); 155 | 156 | var sriData = await FetchFileContent(sriUrl); 157 | 158 | var syntaxTree = CSharpSyntaxTree.ParseText(sriData); 159 | var compilation = CSharpCompilation.Create("Test").AddReferences(MetadataReference.CreateFromFile(typeof(object).Assembly.Location)).AddReferences(MetadataReference.CreateFromFile(typeof(Vector128).Assembly.Location)).AddSyntaxTrees(syntaxTree); 160 | var semanticModel = 
compilation.GetSemanticModel(syntaxTree); 161 | var syntaxTreeRoot = (CompilationUnitSyntax) await syntaxTree.GetRootAsync(); 162 | 163 | var methodDeclarations = syntaxTreeRoot.DescendantNodes(_ => true, true).OfType(); 164 | 165 | foreach (var methodDeclaration in methodDeclarations) 166 | { 167 | SyntaxTrivia comments = default; 168 | if (!methodDeclaration.HasLeadingTrivia || methodDeclaration.GetLeadingTrivia().All(t => (comments = t).Kind() != SyntaxKind.SingleLineDocumentationCommentTrivia || comments.GetStructure() is not DocumentationCommentTriviaSyntax)) continue; 169 | 170 | Match match = default; 171 | var _ = ((DocumentationCommentTriviaSyntax) comments.GetStructure()).Content.OfType().FirstOrDefault(x => (match = cppIntrinsicNameMatcher.Match(x.Content.ToFullString())).Success); 172 | 173 | if (!match.Success) continue; 174 | 175 | var methodSymbol = semanticModel.GetDeclaredSymbol(methodDeclaration); 176 | var csMethod = new CsMethod 177 | { 178 | Name = methodDeclaration.Identifier.ToString(), 179 | ClassPath = methodSymbol.ReceiverType.ToDisplayString(), 180 | Parameters = new CsMethodParam[methodSymbol.Parameters.Length] 181 | }; 182 | 183 | if (IsCsIntrinsicType(methodSymbol.ReturnType.Name)) 184 | { 185 | csMethod.ReturnType.Name = methodSymbol.ReturnType.Name; 186 | csMethod.ReturnType.TypeParameter = methodDeclaration.ReturnType is GenericNameSyntax returnType ? 
returnType.TypeArgumentList.Arguments[0].ToString() : null; 187 | } 188 | else if (methodSymbol.ReturnType is not INamedTypeSymbol {IsGenericType: true}) 189 | { 190 | if (methodDeclaration.ReturnType is PointerTypeSyntax) 191 | { 192 | csMethod.ReturnType.Name = ((IPointerTypeSymbol) methodSymbol.ReturnType).PointedAtType.ToDisplayString(); 193 | csMethod.ReturnType.IsPointer = true; 194 | } 195 | else 196 | { 197 | csMethod.ReturnType.Name = methodSymbol.ReturnType.ToDisplayString(); 198 | } 199 | } 200 | else 201 | { 202 | throw new InvalidOperationException($"Unknown return type {methodSymbol.ReturnType.Name}"); 203 | } 204 | 205 | for (var j = 0; j < methodSymbol.Parameters.Length; j++) 206 | { 207 | var parameter = methodDeclaration.ParameterList.Parameters[j]; 208 | 209 | var parameterSymbol = methodSymbol.Parameters[j]; 210 | var csParameter = new CsMethodParam {Name = parameterSymbol.Name}; 211 | if (parameterSymbol.Type is not INamedTypeSymbol {IsGenericType: true} || !IsCsIntrinsicType(parameterSymbol.Type.Name)) 212 | { 213 | if (parameter.Type is PointerTypeSyntax) 214 | { 215 | csParameter.Type = new CsType 216 | { 217 | Name = ((IPointerTypeSymbol) parameterSymbol.Type).PointedAtType.ToDisplayString(), 218 | IsPointer = true 219 | }; 220 | } 221 | else 222 | { 223 | csParameter.Type = new CsType {Name = parameterSymbol.Type.ToDisplayString()}; 224 | } 225 | 226 | csMethod.Parameters[j] = csParameter; 227 | continue; 228 | } 229 | 230 | var parameterTypeArgument = parameter.Type is GenericNameSyntax parameterType ? 
parameterType.TypeArgumentList.Arguments[0].ToString() : null; 231 | csParameter.Type = new CsType 232 | { 233 | Name = parameterSymbol.Type.Name, 234 | TypeParameter = parameterTypeArgument 235 | }; 236 | csMethod.Parameters[j] = csParameter; 237 | } 238 | 239 | var intelName = match.Groups["fn"].Value; 240 | 241 | if (!intelMethod2CsMethodMap.ContainsKey(intelName)) 242 | { 243 | intelMethod2CsMethodMap[intelName] = new List(); 244 | } 245 | 246 | intelMethod2CsMethodMap[intelName].Add(csMethod); 247 | } 248 | 249 | foreach (var (intelMethodName, csMethods) in intelMethod2CsMethodMap) 250 | { 251 | var intelDataNode = intelData.FirstOrDefault(x => x.Attributes?.GetNamedItem("name")?.Value?.AsSpan().Equals(intelMethodName, StringComparison.InvariantCultureIgnoreCase) ?? false); 252 | if (intelDataNode == null) 253 | { 254 | Debug.WriteLine(intelMethodName); 255 | continue; 256 | } 257 | 258 | var tech = intelDataNode?.Attributes?.GetNamedItem("tech")?.Value.Replace(".", ""); 259 | var intelDataNodeReturn = intelDataNode?.SelectSingleNode("return"); 260 | 261 | var intelMethod = new IntelMethod 262 | { 263 | Name = intelDataNode?.Attributes?.GetNamedItem("name")?.Value, 264 | Return = new IntelMethodParam 265 | { 266 | Name = intelDataNodeReturn?.Attributes?.GetNamedItem("varname")?.Value, 267 | Type = ParseIntelType(intelDataNodeReturn?.Attributes?.GetNamedItem("type")?.Value, intelDataNodeReturn?.Attributes?.GetNamedItem("etype")?.Value) 268 | }, 269 | Description = intelDataNode?.SelectNodes(@"description")?.Cast().Select(n => n.InnerText.Replace(Environment.NewLine, "")).FirstOrDefault(), 270 | Instructions = intelDataNode?.SelectNodes(@"instruction")?.Cast().Select(n => $"{n?.Attributes?.GetNamedItem("name")?.Value} {n?.Attributes?.GetNamedItem("form")?.Value}").FirstOrDefault(), 271 | }; 272 | 273 | var intelMethodParameters = intelDataNode?.SelectNodes(@"parameter")?.Cast().Select(x => new IntelMethodParam 274 | { 275 | Name = 
x.Attributes?.GetNamedItem("varname")?.Value, 276 | Type = ParseIntelType(x.Attributes?.GetNamedItem("type")?.Value, x.Attributes?.GetNamedItem("etype")?.Value) 277 | }).ToArray(); 278 | intelMethod.Parameters = intelMethodParameters.Where(x => x.Type.Name != "void" || x.Type.IsPointer).ToArray(); 279 | 280 | if (csMethods.Count == 0) 281 | { 282 | throw new InvalidOperationException($"No method matching Intel's {intelMethodName} found in SR.Intrinsics namespace"); 283 | } 284 | 285 | var csMethod = FindMostSuited(intelMethod, csMethods); 286 | if (!csMethod.ReturnType.IsPointer && csMethod.ReturnType.Name == "bool" && intelMethod.Return.Type.Name == "int") 287 | { 288 | intelMethod.Return.Type = new IntelType 289 | { 290 | Name = csMethod.ReturnType.Name, 291 | CsType = csMethod.ReturnType, 292 | Hint = "UI8" 293 | }; 294 | } 295 | 296 | var mappedParameters = new List(); 297 | for (var k = 0; k < intelMethod.Parameters.Length; k++) 298 | { 299 | if (csMethod.Parameters.Length == k) break; 300 | var intelMethodParameter = intelMethod.Parameters[k]; 301 | var csMethodParameter = csMethod.Parameters[k]; 302 | 303 | if (IsCsIntrinsicType(csMethodParameter.Type.Name)) 304 | { 305 | mappedParameters.Add($"{intelMethodParameter.Name}.{CsTypeNameToEtype(csMethodParameter.Type.TypeParameter)}"); 306 | continue; 307 | } 308 | 309 | if (intelMethodParameter.Type.Name == csMethodParameter.Type.Name) 310 | { 311 | mappedParameters.Add($"{intelMethodParameter.Name}"); 312 | continue; 313 | } 314 | 315 | if (csMethodParameter.Type.IsPointer && !intelMethodParameter.Type.IsPointer) 316 | { 317 | mappedParameters.Add($"({csMethodParameter.Type})&{intelMethodParameter.Name}"); 318 | } 319 | else 320 | { 321 | mappedParameters.Add($"({csMethodParameter.Type}){intelMethodParameter.Name}"); 322 | } 323 | } 324 | 325 | var codeGenSb = new StringBuilder(); 326 | var tabOffset = "\t\t"; 327 | var returnCast = ""; 328 | if (!IsCsIntrinsicType(csMethod.ReturnType.Name) && 
csMethod.ReturnType.Name != intelMethod.Return.Type.Name) 329 | { 330 | returnCast = $"({intelMethod.Return.Type.ToRenderString()})"; 331 | } 332 | 333 | codeGenSb.AppendLine($"{tabOffset}/// "); 334 | codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Description}"); 335 | codeGenSb.AppendLine($"{tabOffset}/// "); 336 | codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Instructions}"); 337 | foreach (var intelMethodParameter in intelMethod.Parameters) 338 | { 339 | codeGenSb.AppendLine($"{tabOffset}/// {intelMethodParameter.Type.Name} {{{intelMethodParameter.Type.Hint}}}"); 340 | } 341 | 342 | codeGenSb.AppendLine($"{tabOffset}/// {intelMethod.Return.Type.Name} {intelMethod.Return.Name} {{{intelMethod.Return.Type.Hint}}}"); 343 | codeGenSb.AppendLine($"{tabOffset}public static {intelMethod.ToRenderString()} => {returnCast}{csMethod.ClassPath}.{csMethod.Name}({string.Join(", ", mappedParameters)});"); 344 | 345 | if (!outputData.ContainsKey(tech)) 346 | { 347 | outputData[tech] = new ConcurrentDictionary(); 348 | } 349 | outputData[tech][intelMethodName] = codeGenSb.ToString(); 350 | } 351 | } 352 | // Downloads the document at url and returns its body as text. The request, client and response are disposed once the body has been read, and a non-success HTTP status throws HttpRequestException instead of handing an error page to the C#/XML parsers. 353 | private static async Task FetchFileContent(string url) 354 | { 355 | using var request = new HttpRequestMessage 356 | { 357 | Method = HttpMethod.Get, 358 | RequestUri = new Uri(url), 359 | }; 360 | using var client = new HttpClient(); 361 | using var result = await client.SendAsync(request); 362 | return await result.EnsureSuccessStatusCode().Content.ReadAsStringAsync(); 363 | } 364 | 365 | private static string CsTypeNameToEtype(string cst) 366 | { 367 | return cst switch 368 | { 369 | "byte" => "UI8", 370 | "sbyte" => "SI8", 371 | "ushort" => "UI16", 372 | "short" => "SI16", 373 | "uint" => "UI32", 374 | "int" => "SI32", 375 | "ulong" => "UI64", 376 | "long" => "SI64", 377 | "float" => "FP32", 378 | "double" => "FP64", 379 | _ => null 380 | }; 381 | } 382 | 383 | private static string EtypeToCsTypeName(string et) 384 | { 385 | return et switch 386 | { 387 | "UI8" => "byte", 388 | "SI8" => "sbyte", 389 |
"UI16" => "ushort", 390 | "SI16" => "short", 391 | "UI32" => "uint", 392 | "SI32" => "int", 393 | "UI64" => "ulong", 394 | "SI64" => "long", 395 | "FP32" => "float", 396 | "FP64" => "double", 397 | _ => null 398 | }; 399 | } 400 | 401 | private static IntelType ParseIntelType(string type, string etype) 402 | { 403 | var match = IntelTypeDef.Match(type); 404 | if (!match.Success) 405 | { 406 | throw new InvalidOperationException($"Unknown Intel's type {type}"); 407 | } 408 | 409 | static string IntelTypeNameToSystemTypeName(string itn) 410 | { 411 | return itn switch 412 | { 413 | "__int8" or "char" or "__mmask8" => "byte", 414 | "__int16" or "__mmask16" => "short", 415 | "__int32" or "__mmask32" => "int", 416 | "__int64" or "long long" or "__mmask64" => "long", 417 | _ => itn 418 | }; 419 | } 420 | 421 | var isUnsigned = match.Groups["is_unsigned"].Success; 422 | var isPointer = match.Groups["is_ptr"].Success; 423 | var intelTypeName = IntelTypeNameToSystemTypeName(match.Groups["type_name"].Value); 424 | var csType = intelTypeName switch 425 | { 426 | "void" => new CsType 427 | { 428 | Name = "void", 429 | IsPointer = isPointer 430 | }, 431 | "byte" => new CsType 432 | { 433 | Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "byte" : "sbyte"), 434 | IsPointer = isPointer 435 | }, 436 | "short" => new CsType 437 | { 438 | Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "ushort" : "short"), 439 | IsPointer = isPointer 440 | }, 441 | "int" => new CsType 442 | { 443 | Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? "uint" : "int"), 444 | IsPointer = isPointer 445 | }, 446 | "long" => new CsType 447 | { 448 | Name = EtypeToCsTypeName(etype) ?? (isUnsigned ? 
"ulong" : "long"), 449 | IsPointer = isPointer 450 | }, 451 | "float" => new CsType 452 | { 453 | Name = "float", 454 | IsPointer = isPointer 455 | }, 456 | "double" => new CsType 457 | { 458 | Name = "double", 459 | IsPointer = isPointer 460 | }, 461 | "__m64" or "__m64i" or "__m64d" => new CsType 462 | { 463 | Name = "Vector64", 464 | IsPointer = isPointer, 465 | TypeParameter = EtypeToCsTypeName(etype) 466 | }, 467 | "__m128" or "__m128i" or "__m128d" => new CsType 468 | { 469 | Name = "Vector128", 470 | IsPointer = isPointer, 471 | TypeParameter = EtypeToCsTypeName(etype) 472 | }, 473 | "__m256" or "__m256i" or "__m256d" => new CsType 474 | { 475 | Name = "Vector256", 476 | IsPointer = isPointer, 477 | TypeParameter = EtypeToCsTypeName(etype) 478 | }, 479 | _ => throw new InvalidOperationException($"No type matching Intel's {intelTypeName} found") 480 | }; 481 | return new IntelType 482 | { 483 | Name = intelTypeName, 484 | IsPointer = isPointer, 485 | Hint = etype, 486 | CsType = csType 487 | }; 488 | } 489 | 490 | private static bool IsCsIntrinsicType(string name) => name == nameof(Vector64) || name == nameof(Vector128) || name == nameof(Vector256); 491 | // Chooses which C# overload to wrap for an Intel intrinsic: candidates whose first parameter type matches Intel's first parameter win, and among those an overload with the same parameter count is preferred; otherwise the first candidate is used. 492 | private static CsMethod FindMostSuited(IntelMethod intelMethod, List csMethods) 493 | { 494 | var sameFirstParam = csMethods.Where(csMethod => csMethod.Parameters.Length > 0 && intelMethod.Parameters.Length > 0 && intelMethod.Parameters[0].Type.CsType.Name == csMethod.Parameters[0].Type.Name && intelMethod.Parameters[0].Type.CsType.TypeParameter == csMethod.Parameters[0].Type.TypeParameter).ToList(); 495 | // OrderByDescending is a stable sort, so when no overload has the same parameter count the original first match is still returned; when one does, it wins and no Intel argument is silently dropped from the generated call. 496 | if (sameFirstParam.Count > 0) return sameFirstParam.OrderByDescending(csMethod => csMethod.Parameters.Length == intelMethod.Parameters.Length).First(); 497 | 498 | // No first-parameter match (or the Intel signature has no non-void parameters): fall back to the first candidate. 499 | return csMethods[0]; 500 | } 501 | 502 | private struct CsType 503 | { 504 | public string Name; 505 | public string TypeParameter; 506 | public bool IsPointer; 507 | 508 | public override string ToString() 509 | { 510 | var name = TypeParameter == null ? Name : $"{Name}<{TypeParameter}>"; 511 | return IsPointer ?
$"{name}*" : $"{name}"; 512 | } 513 | } 514 | 515 | private struct CsMethod 516 | { 517 | public string ClassPath; 518 | public string Name; 519 | public CsType ReturnType; 520 | public CsMethodParam[] Parameters; 521 | public override string ToString() => $"{ReturnType} {Name}({string.Join(", ", Parameters)})"; 522 | } 523 | 524 | private struct CsMethodParam 525 | { 526 | public string Name; 527 | public CsType Type; 528 | public override string ToString() => $"{Type} {Name}"; 529 | } 530 | 531 | private struct IntelType 532 | { 533 | public string Name; 534 | public string Hint; 535 | public bool IsPointer; 536 | public CsType CsType; 537 | public string ToRenderString() => IsPointer ? $"{Name}*" : $"{Name}"; 538 | public override string ToString() => IsPointer ? $"{Name}* /* {Hint} */" : $"{Name} /* {Hint} */"; 539 | } 540 | 541 | private struct IntelMethod 542 | { 543 | public string Name; 544 | public IntelMethodParam Return; 545 | public IntelMethodParam[] Parameters; 546 | public string Description; 547 | public string Instructions; 548 | public string ToRenderString() => $"{Return.Type.ToRenderString()} {Name}({string.Join(", ", Parameters.Select(x => x.ToRenderString()))})"; 549 | public override string ToString() => $"{Return.Type} {Name}({string.Join(", ", Parameters)})"; 550 | } 551 | 552 | private struct IntelMethodParam 553 | { 554 | public string Name; 555 | public IntelType Type; 556 | public string ToRenderString() => $"{Type.ToRenderString()} {Name}"; 557 | public override string ToString() => $"{Type} {Name}"; 558 | } 559 | } 560 | } -------------------------------------------------------------------------------- /RawIntrinsics/SSE41.cs: -------------------------------------------------------------------------------- 1 | namespace RawIntrinsics 2 | { 3 | public static unsafe partial class SSE41 4 | { 5 | /// 6 | /// Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". 
7 | /// 8 | /// PBLENDW xmm, xmm, imm8 9 | /// __m128i {UI16} 10 | /// __m128i {UI16} 11 | /// int {IMM} 12 | /// __m128i dst {UI16} 13 | public static __m128i _mm_blend_epi16(__m128i a, __m128i b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.UI16, b.UI16, (byte)imm8); 14 | 15 | /// 16 | /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". 17 | /// 18 | /// BLENDPD xmm, xmm, imm8 19 | /// __m128d {FP64} 20 | /// __m128d {FP64} 21 | /// int {IMM} 22 | /// __m128d dst {FP64} 23 | public static __m128d _mm_blend_pd(__m128d a, __m128d b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.FP64, b.FP64, (byte)imm8); 24 | 25 | /// 26 | /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". 27 | /// 28 | /// BLENDPS xmm, xmm, imm8 29 | /// __m128 {FP32} 30 | /// __m128 {FP32} 31 | /// int {IMM} 32 | /// __m128 dst {FP32} 33 | public static __m128 _mm_blend_ps(__m128 a, __m128 b, int imm8) => System.Runtime.Intrinsics.X86.Sse41.Blend(a.FP32, b.FP32, (byte)imm8); 34 | 35 | /// 36 | /// Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". 37 | /// 38 | /// PBLENDVB xmm, xmm 39 | /// __m128i {UI8} 40 | /// __m128i {UI8} 41 | /// __m128i {UI8} 42 | /// __m128i dst {UI8} 43 | public static __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.UI8, b.UI8, mask.UI8); 44 | 45 | /// 46 | /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". 
47 | /// 48 | /// BLENDVPD xmm, xmm 49 | /// __m128d {FP64} 50 | /// __m128d {FP64} 51 | /// __m128d {FP64} 52 | /// __m128d dst {FP64} 53 | public static __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.FP64, b.FP64, mask.FP64); 54 | 55 | /// 56 | /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". 57 | /// 58 | /// BLENDVPS xmm, xmm 59 | /// __m128 {FP32} 60 | /// __m128 {FP32} 61 | /// __m128 {FP32} 62 | /// __m128 dst {FP32} 63 | public static __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) => System.Runtime.Intrinsics.X86.Sse41.BlendVariable(a.FP32, b.FP32, mask.FP32); 64 | 65 | /// 66 | /// Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". 67 | /// 68 | /// ROUNDPD xmm, xmm, imm8 69 | /// __m128d {FP64} 70 | /// __m128d dst {FP64} 71 | public static __m128d _mm_ceil_pd(__m128d a) => System.Runtime.Intrinsics.X86.Sse41.Ceiling(a.FP64); 72 | 73 | /// 74 | /// Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". 75 | /// 76 | /// ROUNDPS xmm, xmm, imm8 77 | /// __m128 {FP32} 78 | /// __m128 dst {FP32} 79 | public static __m128 _mm_ceil_ps(__m128 a) => System.Runtime.Intrinsics.X86.Sse41.Ceiling(a.FP32); 80 | 81 | /// 82 | /// Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
83 | /// 84 | /// ROUNDSD xmm, xmm, imm8 85 | /// __m128d {FP64} 86 | /// __m128d {FP64} 87 | /// __m128d dst {FP64} 88 | public static __m128d _mm_ceil_sd(__m128d a, __m128d b) => System.Runtime.Intrinsics.X86.Sse41.CeilingScalar(a.FP64, b.FP64); 89 | 90 | /// 91 | /// Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 92 | /// 93 | /// ROUNDSS xmm, xmm, imm8 94 | /// __m128 {FP32} 95 | /// __m128 {FP32} 96 | /// __m128 dst {FP32} 97 | public static __m128 _mm_ceil_ss(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse41.CeilingScalar(a.FP32, b.FP32); 98 | 99 | /// 100 | /// Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". 101 | /// 102 | /// PCMPEQQ xmm, xmm 103 | /// __m128i {UI64} 104 | /// __m128i {UI64} 105 | /// __m128i dst {UI64} 106 | public static __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) => System.Runtime.Intrinsics.X86.Sse41.CompareEqual(a.UI64, b.UI64); 107 | 108 | /// 109 | /// Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". 110 | /// 111 | /// PMOVSXWD xmm, xmm 112 | /// __m128i {SI16} 113 | /// __m128i dst {SI32} 114 | public static __m128i _mm_cvtepi16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.SI16); 115 | 116 | /// 117 | /// Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". 118 | /// 119 | /// PMOVSXWQ xmm, xmm 120 | /// __m128i {SI16} 121 | /// __m128i dst {SI64} 122 | public static __m128i _mm_cvtepi16_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI16); 123 | 124 | /// 125 | /// Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".
126 | /// 127 | /// PMOVSXDQ xmm, xmm 128 | /// __m128i {SI32} 129 | /// __m128i dst {SI64} 130 | public static __m128i _mm_cvtepi32_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI32); 131 | 132 | /// 133 | /// Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". 134 | /// 135 | /// PMOVSXBW xmm, xmm 136 | /// __m128i {SI8} 137 | /// __m128i dst {SI16} 138 | public static __m128i _mm_cvtepi8_epi16(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int16(a.SI8); 139 | 140 | /// 141 | /// Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". 142 | /// 143 | /// PMOVSXBD xmm, xmm 144 | /// __m128i {SI8} 145 | /// __m128i dst {SI32} 146 | public static __m128i _mm_cvtepi8_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.SI8); 147 | 148 | /// 149 | /// Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". 150 | /// 151 | /// PMOVSXBQ xmm, xmm 152 | /// __m128i {SI8} 153 | /// __m128i dst {SI64} 154 | public static __m128i _mm_cvtepi8_epi64(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.SI8); 155 | 156 | /// 157 | /// Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". 158 | /// 159 | /// PMOVZXWD xmm, xmm 160 | /// __m128i {UI16} 161 | /// __m128i dst {UI32} 162 | public static __m128i _mm_cvtepu16_epi32(__m128i a) => System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.UI16); 163 | 164 | /// 165 | /// Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". 
///
/// PMOVZXWQ xmm, xmm
/// __m128i {UI16}
/// __m128i dst {UI64}
public static __m128i _mm_cvtepu16_epi64(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI16);
}

/// <summary>
/// Zero-extends the packed unsigned 32-bit integers of <paramref name="a"/> to packed 64-bit integers.
/// </summary>
/// <remarks>PMOVZXDQ xmm, xmm</remarks>
/// <param name="a">__m128i {UI32}</param>
/// <returns>__m128i dst {UI64}</returns>
public static __m128i _mm_cvtepu32_epi64(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI32);
}

/// <summary>
/// Zero-extends the packed unsigned 8-bit integers of <paramref name="a"/> to packed 16-bit integers.
/// </summary>
/// <remarks>PMOVZXBW xmm, xmm</remarks>
/// <param name="a">__m128i {UI8}</param>
/// <returns>__m128i dst {UI16}</returns>
public static __m128i _mm_cvtepu8_epi16(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int16(a.UI8);
}

/// <summary>
/// Zero-extends the packed unsigned 8-bit integers of <paramref name="a"/> to packed 32-bit integers.
/// </summary>
/// <remarks>PMOVZXBD xmm, xmm</remarks>
/// <param name="a">__m128i {UI8}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_cvtepu8_epi32(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int32(a.UI8);
}

/// <summary>
/// Zero-extends the packed unsigned 8-bit integers in the low 8 bytes of <paramref name="a"/> to
/// packed 64-bit integers.
/// </summary>
/// <remarks>PMOVZXBQ xmm, xmm</remarks>
/// <param name="a">__m128i {UI8}</param>
/// <returns>__m128i dst {UI64}</returns>
public static __m128i _mm_cvtepu8_epi64(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.ConvertToVector128Int64(a.UI8);
}

///
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".
///
/// DPPD xmm, xmm, imm8
/// __m128d {FP64}
/// __m128d {FP64}
/// int {IMM}
/// __m128d dst {FP64}
public static __m128d _mm_dp_pd(__m128d a, __m128d b, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.DotProduct(a.FP64, b.FP64, (byte)imm8);
}

/// <summary>
/// Conditional dot product of the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/>: the high 4 bits of <paramref name="imm8"/> select which products are
/// summed and the low 4 bits select which result elements receive the sum.
/// </summary>
/// <remarks>DPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_dp_ps(__m128 a, __m128 b, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.DotProduct(a.FP32, b.FP32, (byte)imm8);
}

/// <summary>
/// Extracts the 32-bit integer of <paramref name="a"/> selected by <paramref name="imm8"/>.
/// </summary>
/// <remarks>PEXTRD r32, xmm, imm8</remarks>
/// <param name="a">__m128i {UI32}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>int dst {UI32}</returns>
public static int _mm_extract_epi32(__m128i a, int imm8)
{
    return (int)System.Runtime.Intrinsics.X86.Sse41.Extract(a.UI32, (byte)imm8);
}

/// <summary>
/// Extracts the 64-bit integer of <paramref name="a"/> selected by <paramref name="imm8"/>.
/// </summary>
/// <remarks>PEXTRQ r64, xmm, imm8</remarks>
/// <param name="a">__m128i {UI64}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>long dst {UI64}</returns>
public static long _mm_extract_epi64(__m128i a, int imm8)
{
    return (long)System.Runtime.Intrinsics.X86.Sse41.X64.Extract(a.UI64, (byte)imm8);
}

///
/// Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".
///
/// PEXTRB r32, xmm, imm8
/// __m128i {UI8}
/// int {IMM}
/// int dst {UI8}
public static int _mm_extract_epi8(__m128i a, int imm8)
{
    return (int)System.Runtime.Intrinsics.X86.Sse41.Extract(a.UI8, (byte)imm8);
}

/// <summary>
/// Extracts the single-precision element of <paramref name="a"/> selected by
/// <paramref name="imm8"/> and returns its raw 32-bit pattern as an integer.
/// </summary>
/// <remarks>EXTRACTPS r32, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>int dst {UI32}; the bits of the selected float, not its numeric value.</returns>
public static int _mm_extract_ps(__m128 a, int imm8)
{
    // The float overload of Extract returns a float. A plain (int) cast would convert the
    // value numerically (e.g. 1.0f -> 1), whereas EXTRACTPS moves the bit pattern into the
    // integer register (1.0f -> 0x3F800000), so reinterpret the bits instead.
    float selected = System.Runtime.Intrinsics.X86.Sse41.Extract(a.FP32, (byte)imm8);
    return System.BitConverter.SingleToInt32Bits(selected);
}

/// <summary>
/// Rounds each packed double-precision element of <paramref name="a"/> down to an integral value.
/// </summary>
/// <remarks>ROUNDPD xmm, xmm, imm8</remarks>
/// <param name="a">__m128d {FP64}</param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_floor_pd(__m128d a)
{
    return System.Runtime.Intrinsics.X86.Sse41.Floor(a.FP64);
}

/// <summary>
/// Rounds each packed single-precision element of <paramref name="a"/> down to an integral value.
/// </summary>
/// <remarks>ROUNDPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_floor_ps(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse41.Floor(a.FP32);
}

///
/// Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
///
/// ROUNDSD xmm, xmm, imm8
/// __m128d {FP64}
/// __m128d {FP64}
/// __m128d dst {FP64}
public static __m128d _mm_floor_sd(__m128d a, __m128d b)
{
    // Use the (upper, value) overload: the low element is rounded from "b" and the upper
    // element is copied from "a". The single-argument overload never looks at "b".
    return System.Runtime.Intrinsics.X86.Sse41.FloorScalar(a.FP64, b.FP64);
}

/// <summary>
/// Rounds the lowest single-precision element of <paramref name="b"/> down to an integral value,
/// stores it in the lowest element of the result, and copies the upper three elements from
/// <paramref name="a"/>.
/// </summary>
/// <remarks>ROUNDSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}; supplies the upper three elements.</param>
/// <param name="b">__m128 {FP32}; supplies the element that is rounded.</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_floor_ss(__m128 a, __m128 b)
{
    // Two-operand overload so that "b" (not "a") provides the value being rounded.
    return System.Runtime.Intrinsics.X86.Sse41.FloorScalar(a.FP32, b.FP32);
}

/// <summary>
/// Copies <paramref name="a"/> and replaces the 32-bit element selected by
/// <paramref name="imm8"/> with <paramref name="i"/>.
/// </summary>
/// <remarks>PINSRD xmm, r32, imm8</remarks>
/// <param name="a">__m128i {UI32}</param>
/// <param name="i">int {UI32}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_insert_epi32(__m128i a, int i, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.Insert(a.UI32, (uint)i, (byte)imm8);
}

/// <summary>
/// Copies <paramref name="a"/> and replaces the 64-bit element selected by
/// <paramref name="imm8"/> with <paramref name="i"/>.
/// </summary>
/// <remarks>PINSRQ xmm, r64, imm8</remarks>
/// <param name="a">__m128i {UI64}</param>
/// <param name="i">long {UI64}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>__m128i dst {UI64}</returns>
public static __m128i _mm_insert_epi64(__m128i a, long i, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.X64.Insert(a.UI64, (ulong)i, (byte)imm8);
}

///
/// Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8".
///
/// PINSRB xmm, r32, imm8
/// __m128i {UI8}
/// int {UI8}
/// int {IMM}
/// __m128i dst {UI8}
public static __m128i _mm_insert_epi8(__m128i a, int i, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.Insert(a.UI8, (byte)i, (byte)imm8);
}

/// <summary>
/// Copies <paramref name="a"/>, inserts a single-precision element from <paramref name="b"/>
/// as directed by <paramref name="imm8"/>, and zeroes the result elements whose mask bit in
/// <paramref name="imm8"/> is set.
/// </summary>
/// <remarks>INSERTPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_insert_ps(__m128 a, __m128 b, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.Insert(a.FP32, b.FP32, (byte)imm8);
}

/// <summary>
/// Element-wise maximum of the packed signed 32-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMAXSD xmm, xmm</remarks>
/// <param name="a">__m128i {SI32}</param>
/// <param name="b">__m128i {SI32}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_max_epi32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Max(a.SI32, b.SI32);
}

/// <summary>
/// Element-wise maximum of the packed signed 8-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMAXSB xmm, xmm</remarks>
/// <param name="a">__m128i {SI8}</param>
/// <param name="b">__m128i {SI8}</param>
/// <returns>__m128i dst {UI8}</returns>
public static __m128i _mm_max_epi8(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Max(a.SI8, b.SI8);
}

///
/// Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".
///
/// PMAXUW xmm, xmm
/// __m128i {UI16}
/// __m128i {UI16}
/// __m128i dst {UI16}
public static __m128i _mm_max_epu16(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Max(a.UI16, b.UI16);
}

/// <summary>
/// Element-wise maximum of the packed unsigned 32-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMAXUD xmm, xmm</remarks>
/// <param name="a">__m128i {UI32}</param>
/// <param name="b">__m128i {UI32}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_max_epu32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Max(a.UI32, b.UI32);
}

/// <summary>
/// Element-wise minimum of the packed signed 32-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMINSD xmm, xmm</remarks>
/// <param name="a">__m128i {SI32}</param>
/// <param name="b">__m128i {SI32}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Min(a.SI32, b.SI32);
}

/// <summary>
/// Element-wise minimum of the packed signed 8-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMINSB xmm, xmm</remarks>
/// <param name="a">__m128i {SI8}</param>
/// <param name="b">__m128i {SI8}</param>
/// <returns>__m128i dst {UI8}</returns>
public static __m128i _mm_min_epi8(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Min(a.SI8, b.SI8);
}

/// <summary>
/// Element-wise minimum of the packed unsigned 16-bit integers of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>PMINUW xmm, xmm</remarks>
/// <param name="a">__m128i {UI16}</param>
/// <param name="b">__m128i {UI16}</param>
/// <returns>__m128i dst {UI16}</returns>
public static __m128i _mm_min_epu16(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Min(a.UI16, b.UI16);
}

///
/// Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".
///
/// PMINUD xmm, xmm
/// __m128i {UI32}
/// __m128i {UI32}
/// __m128i dst {UI32}
public static __m128i _mm_min_epu32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Min(a.UI32, b.UI32);
}

/// <summary>
/// Finds the horizontal minimum of the packed unsigned 16-bit integers of
/// <paramref name="a"/>; the result holds the minimum and its index, with the remaining bits zeroed.
/// </summary>
/// <remarks>PHMINPOSUW xmm, xmm</remarks>
/// <param name="a">__m128i {UI16}</param>
/// <returns>__m128i dst {UI16}</returns>
public static __m128i _mm_minpos_epu16(__m128i a)
{
    return System.Runtime.Intrinsics.X86.Sse41.MinHorizontal(a.UI16);
}

/// <summary>
/// Computes eight sums of absolute differences between one quadruplet of unsigned 8-bit
/// integers from <paramref name="b"/> and eight sequential quadruplets from
/// <paramref name="a"/>; the starting offsets are taken from <paramref name="imm8"/> and the
/// results are 16-bit values.
/// </summary>
/// <remarks>MPSADBW xmm, xmm, imm8</remarks>
/// <param name="a">__m128i {UI8}</param>
/// <param name="b">__m128i {UI8}</param>
/// <param name="imm8">int {IMM}; narrowed to a byte before use.</param>
/// <returns>__m128i dst {UI8}</returns>
public static __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse41.MultipleSumAbsoluteDifferences(a.UI8, b.UI8, (byte)imm8);
}

///
/// Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst".
///
/// PMULDQ xmm, xmm
/// __m128i {SI32}
/// __m128i {SI32}
/// __m128i dst {SI64}
public static __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.Multiply(a.SI32, b.SI32);
}

/// <summary>
/// Multiplies the packed 32-bit integers of <paramref name="a"/> and <paramref name="b"/> and
/// keeps the low 32 bits of each 64-bit intermediate product.
/// </summary>
/// <remarks>PMULLD xmm, xmm</remarks>
/// <param name="a">__m128i {UI32}</param>
/// <param name="b">__m128i {UI32}</param>
/// <returns>__m128i dst {UI32}</returns>
public static __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.MultiplyLow(a.UI32, b.UI32);
}

/// <summary>
/// Packs the signed 32-bit integers of <paramref name="a"/> and <paramref name="b"/> into
/// 16-bit integers using unsigned saturation.
/// </summary>
/// <remarks>PACKUSDW xmm, xmm</remarks>
/// <param name="a">__m128i {SI32}</param>
/// <param name="b">__m128i {SI32}</param>
/// <returns>__m128i dst {UI16}</returns>
public static __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.PackUnsignedSaturate(a.SI32, b.SI32);
}

/// <summary>
/// Rounds each packed double-precision element of <paramref name="a"/> using the mode encoded
/// in <paramref name="rounding"/>.
/// </summary>
/// <remarks>
/// ROUNDPD xmm, xmm, imm8. The managed API only exposes fixed-immediate overloads, so the
/// mode is dispatched at run time. The exception-suppression bit (value 8) is not forwarded
/// separately; the directed-rounding overloads already encode it.
/// </remarks>
/// <param name="a">__m128d {FP64}</param>
/// <param name="rounding">
/// int {IMM}; 0 = nearest, 1 = toward negative infinity, 2 = toward positive infinity,
/// 3 = toward zero, 4 = current MXCSR direction (optionally OR-ed with 8).
/// </param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_round_pd(__m128d a, int rounding)
{
    // Bit 2 selects the MXCSR rounding mode and makes the low two bits irrelevant.
    if ((rounding & 0x04) != 0)
    {
        return System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirection(a.FP64);
    }

    switch (rounding & 0x03)
    {
        case 1:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNegativeInfinity(a.FP64);
        case 2:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToPositiveInfinity(a.FP64);
        case 3:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToZero(a.FP64);
        default:
            // Mode 0: the only behaviour the previous implementation provided.
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNearestInteger(a.FP64);
    }
}

///
/// Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
/// [round_note]
///
/// ROUNDPS xmm, xmm, imm8
/// __m128 {FP32}
/// int {IMM}
/// __m128 dst {FP32}
public static __m128 _mm_round_ps(__m128 a, int rounding)
{
    // The managed API only offers fixed-immediate overloads, so dispatch on the mode.
    // Bit 2 selects the MXCSR rounding mode and makes the low two bits irrelevant.
    if ((rounding & 0x04) != 0)
    {
        return System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirection(a.FP32);
    }

    switch (rounding & 0x03)
    {
        case 1:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNegativeInfinity(a.FP32);
        case 2:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToPositiveInfinity(a.FP32);
        case 3:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToZero(a.FP32);
        default:
            // Mode 0: the only behaviour the previous implementation provided.
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNearestInteger(a.FP32);
    }
}

/// <summary>
/// Rounds the lowest double-precision element of <paramref name="b"/> using the mode encoded
/// in <paramref name="rounding"/>, stores it in the lowest element of the result, and copies
/// the upper element from <paramref name="a"/>.
/// </summary>
/// <remarks>
/// ROUNDSD xmm, xmm, imm8. The mode is dispatched at run time to the fixed-immediate managed
/// overloads; the exception-suppression bit (value 8) is not forwarded separately.
/// </remarks>
/// <param name="a">__m128d {FP64}; supplies the upper element.</param>
/// <param name="b">__m128d {FP64}; supplies the element that is rounded.</param>
/// <param name="rounding">
/// int {IMM}; 0 = nearest, 1 = toward negative infinity, 2 = toward positive infinity,
/// 3 = toward zero, 4 = current MXCSR direction (optionally OR-ed with 8).
/// </param>
/// <returns>__m128d dst {FP64}</returns>
public static __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
{
    // (upper, value) overloads are used throughout so that "b" provides the rounded element.
    if ((rounding & 0x04) != 0)
    {
        return System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirectionScalar(a.FP64, b.FP64);
    }

    switch (rounding & 0x03)
    {
        case 1:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNegativeInfinityScalar(a.FP64, b.FP64);
        case 2:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToPositiveInfinityScalar(a.FP64, b.FP64);
        case 3:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToZeroScalar(a.FP64, b.FP64);
        default:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNearestIntegerScalar(a.FP64, b.FP64);
    }
}

/// <summary>
/// Rounds the lowest single-precision element of <paramref name="b"/> using the mode encoded
/// in <paramref name="rounding"/>, stores it in the lowest element of the result, and copies
/// the upper three elements from <paramref name="a"/>.
/// </summary>
/// <remarks>
/// ROUNDSS xmm, xmm, imm8. The mode is dispatched at run time to the fixed-immediate managed
/// overloads; the exception-suppression bit (value 8) is not forwarded separately.
/// </remarks>
/// <param name="a">__m128 {FP32}; supplies the upper three elements.</param>
/// <param name="b">__m128 {FP32}; supplies the element that is rounded.</param>
/// <param name="rounding">
/// int {IMM}; 0 = nearest, 1 = toward negative infinity, 2 = toward positive infinity,
/// 3 = toward zero, 4 = current MXCSR direction (optionally OR-ed with 8).
/// </param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
{
    // (upper, value) overloads are used throughout so that "b" provides the rounded element.
    if ((rounding & 0x04) != 0)
    {
        return System.Runtime.Intrinsics.X86.Sse41.RoundCurrentDirectionScalar(a.FP32, b.FP32);
    }

    switch (rounding & 0x03)
    {
        case 1:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNegativeInfinityScalar(a.FP32, b.FP32);
        case 2:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToPositiveInfinityScalar(a.FP32, b.FP32);
        case 3:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToZeroScalar(a.FP32, b.FP32);
        default:
            return System.Runtime.Intrinsics.X86.Sse41.RoundToNearestIntegerScalar(a.FP32, b.FP32);
    }
}

///
/// Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// MOVNTDQA xmm, m128
/// __m128i {M128}
/// __m128i dst {M128}
public static __m128i _mm_stream_load_si128(__m128i* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse41.LoadAlignedVector128NonTemporal((sbyte*)mem_addr);
}

/// <summary>
/// PTEST carry check: returns the "CF" flag, which is set when (NOT <paramref name="a"/>) AND
/// <paramref name="b"/> is all zero.
/// </summary>
/// <remarks>PTEST xmm, xmm</remarks>
/// <param name="a">__m128i {M128}</param>
/// <param name="b">__m128i {M128}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_testc_si128(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.TestC(a.SI8, b.SI8);
}

/// <summary>
/// PTEST combined check: returns true when both "ZF" (<paramref name="a"/> AND
/// <paramref name="b"/> is zero) and "CF" ((NOT <paramref name="a"/>) AND
/// <paramref name="b"/> is zero) are clear.
/// </summary>
/// <remarks>PTEST xmm, xmm</remarks>
/// <param name="a">__m128i {M128}</param>
/// <param name="b">__m128i {M128}</param>
/// <returns>bool dst {UI8}</returns>
public static bool _mm_testnzc_si128(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.TestNotZAndNotC(a.SI8, b.SI8);
}

///
/// Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.
///
/// PTEST xmm, xmm
/// __m128i {M128}
/// __m128i {M128}
/// bool k {UI8}
public static bool _mm_testz_si128(__m128i a, __m128i b)
{
    return System.Runtime.Intrinsics.X86.Sse41.TestZ(a.SI8, b.SI8);
}

}
}

--------------------------------------------------------------------------------
/RawIntrinsics/SSE.cs:
--------------------------------------------------------------------------------
namespace RawIntrinsics
{
public static unsafe partial class SSE
{
/// <summary>
/// Adds the packed single-precision elements of <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <remarks>ADDPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Add(a.FP32, b.FP32);
}

/// <summary>
/// Adds the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/>; the upper three elements are copied from <paramref name="a"/>.
/// </summary>
/// <remarks>ADDSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_add_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.AddScalar(a.FP32, b.FP32);
}

/// <summary>
/// Bitwise AND of the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/>.
/// </summary>
/// <remarks>ANDPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.And(a.FP32, b.FP32);
}

///
/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".
///
/// ANDNPS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.AndNot(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for equality.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for equality; the upper three elements are copied from <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for greater-than-or-equal.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareGreaterThanOrEqual(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarGreaterThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for greater-than.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareGreaterThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for greater-than; the upper three elements are copied from
/// <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarGreaterThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for less-than-or-equal.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareLessThanOrEqual(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarLessThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for less-than.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareLessThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for less-than; the upper three elements are copied from
/// <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarLessThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-equal.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareNotEqual(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarNotEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-greater-than-or-equal.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareNotGreaterThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-greater-than-or-equal; the upper three elements are copied
/// from <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarNotGreaterThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-greater-than.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareNotGreaterThan(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarNotGreaterThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-less-than-or-equal.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareNotLessThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-less-than-or-equal; the upper three elements are copied from
/// <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarNotLessThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-less-than.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareNotLessThan(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarNotLessThan(a.FP32, b.FP32);
}

/// <summary>
/// Checks, per packed single-precision element, that neither <paramref name="a"/> nor
/// <paramref name="b"/> is NaN.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareOrdered(a.FP32, b.FP32);
}

/// <summary>
/// Checks that neither lowest single-precision element of <paramref name="a"/> and
/// <paramref name="b"/> is NaN; the upper three elements are copied from <paramref name="a"/>.
/// </summary>
/// <remarks>CMPSS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpord_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrdered(a.FP32, b.FP32);
}

/// <summary>
/// Checks, per packed single-precision element, whether either <paramref name="a"/> or
/// <paramref name="b"/> is NaN.
/// </summary>
/// <remarks>CMPPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareUnordered(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CMPSS xmm, xmm, imm8
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnordered(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for equality and returns the boolean result.
/// </summary>
/// <remarks>COMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_comieq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for greater-than-or-equal and returns the boolean result.
/// </summary>
/// <remarks>COMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_comige_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedGreaterThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for greater-than and returns the boolean result.
/// </summary>
/// <remarks>COMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_comigt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedGreaterThan(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).
///
/// COMISS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// bool k {UI8}
public static bool _mm_comile_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedLessThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for less-than and returns the boolean result.
/// </summary>
/// <remarks>COMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_comilt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedLessThan(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision elements of <paramref name="a"/> and
/// <paramref name="b"/> for not-equal and returns the boolean result.
/// </summary>
/// <remarks>COMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_comineq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarOrderedNotEqual(a.FP32, b.FP32);
}

/// <summary>
/// Converts the signed 32-bit integer <paramref name="b"/> to single precision and stores it
/// in the lowest element of the result; the upper three elements are copied from
/// <paramref name="a"/>.
/// </summary>
/// <remarks>CVTSI2SS xmm, r32</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">int {SI32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_cvtsi32_ss(__m128 a, int b)
{
    return System.Runtime.Intrinsics.X86.Sse.ConvertScalarToVector128Single(a.FP32, b);
}

///
/// Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// CVTSI2SS xmm, r64
/// __m128 {FP32}
/// long {SI64}
/// __m128 dst {FP32}
public static __m128 _mm_cvtsi64_ss(__m128 a, long b)
{
    return System.Runtime.Intrinsics.X86.Sse.X64.ConvertScalarToVector128Single(a.FP32, b);
}

/// <summary>
/// Converts the lowest single-precision (32-bit) floating-point element of "a" to a 32-bit integer and returns it as "dst".
/// </summary>
/// <remarks>Instruction: CVTSS2SI r32, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>int dst {UI32}</returns>
public static int _mm_cvtss_si32(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.ConvertToInt32(a.FP32);
}

/// <summary>
/// Converts the lowest single-precision (32-bit) floating-point element of "a" to a 64-bit integer and returns it as "dst".
/// </summary>
/// <remarks>Instruction: CVTSS2SI r64, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>long dst {UI64}</returns>
public static long _mm_cvtss_si64(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.X64.ConvertToInt64(a.FP32);
}

/// <summary>
/// Converts the lowest single-precision (32-bit) floating-point element of "a" to a 32-bit integer with truncation and returns it as "dst".
/// </summary>
/// <remarks>Instruction: CVTTSS2SI r32, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>int dst {UI32}</returns>
public static int _mm_cvttss_si32(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.ConvertToInt32WithTruncation(a.FP32);
}

/// <summary>
/// Converts the lowest single-precision (32-bit) floating-point element of "a" to a 64-bit integer with truncation and returns it as "dst".
/// </summary>
/// <remarks>Instruction: CVTTSS2SI r64, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>long dst {UI64}</returns>
public static long _mm_cvttss_si64(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.X64.ConvertToInt64WithTruncation(a.FP32);
}

///
/// Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
///
/// DIVPS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_div_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Divide(a.FP32, b.FP32);
}

/// <summary>
/// Divides the lowest single-precision (32-bit) floating-point element of "a" by the lowest element of "b", places the quotient in the lowest element of "dst", and carries the upper 3 packed elements of "a" over to "dst".
/// </summary>
/// <remarks>Instruction: DIVSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_div_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.DivideScalar(a.FP32, b.FP32);
}

/// <summary>
/// Loads 128 bits (4 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
/// </summary>
/// <remarks>Instruction: MOVAPS xmm, m128</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_load_ps(float* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse.LoadAlignedVector128(mem_addr);
}

/// <summary>
/// Loads one single-precision (32-bit) floating-point element from memory into the lowest element of "dst" and zeroes the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary.
/// </summary>
/// <remarks>Instruction: MOVSS xmm, m32</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_load_ss(float* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse.LoadScalarVector128(mem_addr);
}

///
/// Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.
///
/// MOVHPS xmm, m64
/// __m128 {FP32}
/// __m64 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_loadh_pi(__m128 a, __m64* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse.LoadHigh(a.FP32, (float*)mem_addr);
}

/// <summary>
/// Loads 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst" and carries the upper 2 elements of "a" over to "dst". "mem_addr" does not need to be aligned on any particular boundary.
/// </summary>
/// <remarks>Instruction: MOVLPS xmm, m64</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="mem_addr">__m64 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_loadl_pi(__m128 a, __m64* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse.LoadLow(a.FP32, (float*)mem_addr);
}

/// <summary>
/// Loads 128 bits (4 packed single-precision (32-bit) floating-point elements) from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary.
/// </summary>
/// <remarks>Instruction: MOVUPS xmm, m128</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_loadu_ps(float* mem_addr)
{
    return System.Runtime.Intrinsics.X86.Sse.LoadVector128(mem_addr);
}

/// <summary>
/// Compares the packed single-precision (32-bit) floating-point elements of "a" and "b" and stores the packed maximum values in "dst".
/// </summary>
/// <remarks>Instruction: MAXPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_max_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Max(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper element of "dst".
///
/// MAXSS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_max_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MaxScalar(a.FP32, b.FP32);
}

/// <summary>
/// Compares the packed single-precision (32-bit) floating-point elements of "a" and "b" and stores the packed minimum values in "dst".
/// </summary>
/// <remarks>Instruction: MINPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_min_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Min(a.FP32, b.FP32);
}

/// <summary>
/// Compares the lowest single-precision (32-bit) floating-point elements of "a" and "b", places the minimum in the lowest element of "dst", and carries the upper 3 packed elements of "a" over to the upper elements of "dst".
/// </summary>
/// <remarks>Instruction: MINSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_min_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MinScalar(a.FP32, b.FP32);
}

/// <summary>
/// Moves the lowest single-precision (32-bit) floating-point element of "b" into the lowest element of "dst" and carries the upper 3 packed elements of "a" over to the upper elements of "dst".
/// </summary>
/// <remarks>Instruction: MOVSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_move_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MoveScalar(a.FP32, b.FP32);
}

///
/// Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst".
///
/// MOVHLPS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_movehl_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MoveHighToLow(a.FP32, b.FP32);
}

/// <summary>
/// Moves the lower 2 single-precision (32-bit) floating-point elements of "b" into the upper 2 elements of "dst" and carries the lower 2 elements of "a" over to the lower 2 elements of "dst".
/// </summary>
/// <remarks>Instruction: MOVLHPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_movelh_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MoveLowToHigh(a.FP32, b.FP32);
}

/// <summary>
/// Builds the mask "dst" by setting each bit from the most significant bit of the corresponding packed single-precision (32-bit) floating-point element of "a".
/// </summary>
/// <remarks>Instruction: MOVMSKPS r32, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>int dst {UI32}</returns>
public static int _mm_movemask_ps(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.MoveMask(a.FP32);
}

/// <summary>
/// Multiplies the packed single-precision (32-bit) floating-point elements of "a" and "b" and stores the products in "dst".
/// </summary>
/// <remarks>Instruction: MULPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Multiply(a.FP32, b.FP32);
}

///
/// Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// MULSS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.MultiplyScalar(a.FP32, b.FP32);
}

/// <summary>
/// Computes the bitwise OR of the packed single-precision (32-bit) floating-point elements of "a" and "b" and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: ORPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Or(a.FP32, b.FP32);
}

/// <summary>
/// Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".
/// </summary>
/// <remarks>
/// Instruction: PREFETCHNTA / PREFETCHT2 / PREFETCHT1 / PREFETCHT0 m8, selected by "i".
/// Hint values are read with the GCC/Clang xmmintrin.h numbering: 0 = NTA, 1 = T2, 2 = T1, 3 = T0.
/// NOTE(review): MSVC's header reportedly numbers T0 and T2 the other way round (1 = T0, 3 = T2) - confirm which convention callers use.
/// Any other value falls back to Prefetch0, which is what this wrapper issued for every hint before the dispatch was added.
/// </remarks>
/// <param name="p">byte {UI8}</param>
/// <param name="i">int {IMM}</param>
public static void _mm_prefetch(byte* p, int i)
{
    void* address = (void*)p;

    switch (i)
    {
        case 0: // _MM_HINT_NTA
            System.Runtime.Intrinsics.X86.Sse.PrefetchNonTemporal(address);
            break;
        case 1: // _MM_HINT_T2 (GCC/Clang numbering)
            System.Runtime.Intrinsics.X86.Sse.Prefetch2(address);
            break;
        case 2: // _MM_HINT_T1
            System.Runtime.Intrinsics.X86.Sse.Prefetch1(address);
            break;
        default: // _MM_HINT_T0 (3) and unrecognised hints keep the previous behaviour
            System.Runtime.Intrinsics.X86.Sse.Prefetch0(address);
            break;
    }
}

/// <summary>
/// Computes the approximate reciprocal of the packed single-precision (32-bit) floating-point elements of "a" and stores the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.
/// </summary>
/// <remarks>Instruction: RCPPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_rcp_ps(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.Reciprocal(a.FP32);
}

///
/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.
///
/// RCPSS xmm, xmm
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_rcp_ss(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.ReciprocalScalar(a.FP32);
}

/// <summary>
/// Computes the approximate reciprocal square root of the packed single-precision (32-bit) floating-point elements of "a" and stores the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.
/// </summary>
/// <remarks>Instruction: RSQRTPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_rsqrt_ps(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.ReciprocalSqrt(a.FP32);
}

/// <summary>
/// Computes the approximate reciprocal square root of the lowest single-precision (32-bit) floating-point element of "a", places it in the lowest element of "dst", and carries the upper 3 packed elements of "a" over to "dst". The maximum relative error for this approximation is less than 1.5*2^-12.
/// </summary>
/// <remarks>Instruction: RSQRTSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_rsqrt_ss(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.ReciprocalSqrtScalar(a.FP32);
}

/// <summary>
/// Broadcasts the single-precision (32-bit) floating-point value "a" to all elements of "dst".
/// </summary>
/// <param name="a">float {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_set1_ps(float a)
{
    return System.Runtime.Intrinsics.Vector128.Create(a);
}

///
/// Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.
///
///
/// float {FP32}
/// float {FP32}
/// float {FP32}
/// float {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_setr_ps(float e3, float e2, float e1, float e0)
{
    return System.Runtime.Intrinsics.Vector128.Create(e3, e2, e1, e0);
}

/// <summary>
/// Performs a serializing operation on all store-to-memory instructions issued before this instruction, so that every preceding store (in program order) is globally visible before any store that follows the fence.
/// </summary>
/// <remarks>Instruction: SFENCE</remarks>
public static void _mm_sfence()
{
    System.Runtime.Intrinsics.X86.Sse.StoreFence();
}

/// <summary>
/// Shuffles single-precision (32-bit) floating-point elements using the control in "imm8" and stores the results in "dst". The control is narrowed to a byte before being passed on.
/// </summary>
/// <remarks>Instruction: SHUFPS xmm, xmm, imm8</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <param name="imm8">int {IMM}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_shuffle_ps(__m128 a, __m128 b, int imm8)
{
    return System.Runtime.Intrinsics.X86.Sse.Shuffle(a.FP32, b.FP32, (byte)imm8);
}

/// <summary>
/// Computes the square root of the packed single-precision (32-bit) floating-point elements of "a" and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: SQRTPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_sqrt_ps(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.Sqrt(a.FP32);
}

///
/// Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
///
/// SQRTSS xmm, xmm
/// __m128 {FP32}
/// __m128 dst {FP32}
public static __m128 _mm_sqrt_ss(__m128 a)
{
    return System.Runtime.Intrinsics.X86.Sse.SqrtScalar(a.FP32);
}

/// <summary>
/// Stores 128 bits (4 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
/// </summary>
/// <remarks>Instruction: MOVAPS m128, xmm</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <param name="a">__m128 {FP32}</param>
public static void _mm_store_ps(float* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.StoreAligned(mem_addr, a.FP32);
}

/// <summary>
/// Stores the lowest single-precision (32-bit) floating-point element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.
/// </summary>
/// <remarks>Instruction: MOVSS m32, xmm</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <param name="a">__m128 {FP32}</param>
public static void _mm_store_ss(float* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.StoreScalar(mem_addr, a.FP32);
}

/// <summary>
/// Stores the upper 2 single-precision (32-bit) floating-point elements of "a" into memory.
/// </summary>
/// <remarks>Instruction: MOVHPS m64, xmm</remarks>
/// <param name="mem_addr">__m64 {FP32}</param>
/// <param name="a">__m128 {FP32}</param>
public static void _mm_storeh_pi(__m64* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.StoreHigh((float*)mem_addr, a.FP32);
}

/// <summary>
/// Stores the lower 2 single-precision (32-bit) floating-point elements of "a" into memory.
/// </summary>
/// <remarks>Instruction: MOVLPS m64, xmm</remarks>
/// <param name="mem_addr">__m64 {FP32}</param>
/// <param name="a">__m128 {FP32}</param>
public static void _mm_storel_pi(__m64* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.StoreLow((float*)mem_addr, a.FP32);
}

///
/// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.
///
/// MOVUPS m128, xmm
/// float {FP32}
/// __m128 {FP32}
/// void {}
public static void _mm_storeu_ps(float* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.Store(mem_addr, a.FP32);
}

/// <summary>
/// Stores 128 bits (4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
/// </summary>
/// <remarks>Instruction: MOVNTPS m128, xmm</remarks>
/// <param name="mem_addr">float {FP32}</param>
/// <param name="a">__m128 {FP32}</param>
public static void _mm_stream_ps(float* mem_addr, __m128 a)
{
    System.Runtime.Intrinsics.X86.Sse.StoreAlignedNonTemporal(mem_addr, a.FP32);
}

/// <summary>
/// Subtracts the packed single-precision (32-bit) floating-point elements of "b" from those of "a" and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: SUBPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.Subtract(a.FP32, b.FP32);
}

/// <summary>
/// Subtracts the lowest single-precision (32-bit) floating-point element of "b" from the lowest element of "a", places the difference in the lowest element of "dst", and carries the upper 3 packed elements of "a" over to "dst".
/// </summary>
/// <remarks>Instruction: SUBSS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.SubtractScalar(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// UCOMISS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// bool k {UI8}
public static bool _mm_ucomieq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedEqual(a.FP32, b.FP32);
}

/// <summary>
/// Tests whether the lowest single-precision (32-bit) floating-point element of "a" is greater than or equal to that of "b" and yields the boolean outcome (0 or 1). This instruction will not signal an exception for QNaNs.
/// </summary>
/// <remarks>Instruction: UCOMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_ucomige_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedGreaterThanOrEqual(a.FP32, b.FP32);
}

/// <summary>
/// Tests whether the lowest single-precision (32-bit) floating-point element of "a" is greater than that of "b" and yields the boolean outcome (0 or 1). This instruction will not signal an exception for QNaNs.
/// </summary>
/// <remarks>Instruction: UCOMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_ucomigt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedGreaterThan(a.FP32, b.FP32);
}

/// <summary>
/// Tests whether the lowest single-precision (32-bit) floating-point element of "a" is less than or equal to that of "b" and yields the boolean outcome (0 or 1). This instruction will not signal an exception for QNaNs.
/// </summary>
/// <remarks>Instruction: UCOMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_ucomile_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedLessThanOrEqual(a.FP32, b.FP32);
}

///
/// Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// UCOMISS xmm, xmm
/// __m128 {FP32}
/// __m128 {FP32}
/// bool k {UI8}
public static bool _mm_ucomilt_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedLessThan(a.FP32, b.FP32);
}

/// <summary>
/// Tests whether the lowest single-precision (32-bit) floating-point elements of "a" and "b" differ and yields the boolean outcome (0 or 1). This instruction will not signal an exception for QNaNs.
/// </summary>
/// <remarks>Instruction: UCOMISS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>bool k {UI8}</returns>
public static bool _mm_ucomineq_ss(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.CompareScalarUnorderedNotEqual(a.FP32, b.FP32);
}

/// <summary>
/// Unpacks and interleaves the single-precision (32-bit) floating-point elements from the high half of "a" and "b" and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: UNPCKHPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.UnpackHigh(a.FP32, b.FP32);
}

/// <summary>
/// Unpacks and interleaves the single-precision (32-bit) floating-point elements from the low half of "a" and "b" and stores the results in "dst".
/// </summary>
/// <remarks>Instruction: UNPCKLPS xmm, xmm</remarks>
/// <param name="a">__m128 {FP32}</param>
/// <param name="b">__m128 {FP32}</param>
/// <returns>__m128 dst {FP32}</returns>
public static __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
    return System.Runtime.Intrinsics.X86.Sse.UnpackLow(a.FP32, b.FP32);
}

///
/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
758 | /// 759 | /// Instruction: XORPS xmm, xmm 760 | /// a: __m128 {FP32} 761 | /// b: __m128 {FP32} 762 | /// Returns: __m128 dst {FP32} 763 | public static __m128 _mm_xor_ps(__m128 a, __m128 b) => System.Runtime.Intrinsics.X86.Sse.Xor(a.FP32, b.FP32); 764 | 765 | } 766 | } 767 | --------------------------------------------------------------------------------