├── .clang-format
├── CMakeLists.txt
├── Guts
    ├── bool1.h
    ├── conversion.h
    ├── emulation.h
    ├── f16.h
    ├── f32.h
    ├── f64.h
    ├── i32.h
    ├── math.h
    ├── other.h
    ├── packing.h
    ├── sorting.h
    ├── swizzle.h
    ├── tests.h
    └── u32.h
├── LICENSE.txt
├── README.md
├── ml.h
└── ml.hlsli


/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | BasedOnStyle: Google
  3 | AccessModifierOffset: -4
  4 | AlignAfterOpenBracket: DontAlign
  5 | AlignArrayOfStructures: None
  6 | AlignConsecutiveAssignments:
  7 |   Enabled: false
  8 |   AcrossEmptyLines: false
  9 |   AcrossComments: false
 10 |   AlignCompound: false
 11 |   AlignFunctionPointers: false
 12 |   PadOperators: true
 13 | AlignConsecutiveBitFields:
 14 |   Enabled: true
 15 |   AcrossEmptyLines: false
 16 |   AcrossComments: false
 17 |   AlignCompound: false
 18 |   AlignFunctionPointers: false
 19 |   PadOperators: false
 20 | AlignConsecutiveDeclarations:
 21 |   Enabled: false
 22 |   AcrossEmptyLines: false
 23 |   AcrossComments: false
 24 |   AlignCompound: false
 25 |   AlignFunctionPointers: false
 26 |   PadOperators: false
 27 | AlignConsecutiveMacros:
 28 |   Enabled: true
 29 |   AcrossEmptyLines: false
 30 |   AcrossComments: false
 31 |   AlignCompound: false
 32 |   AlignFunctionPointers: false
 33 |   PadOperators: false
 34 | AlignConsecutiveShortCaseStatements:
 35 |   Enabled: false
 36 |   AcrossEmptyLines: false
 37 |   AcrossComments: false
 38 |   AlignCaseArrows: false
 39 |   AlignCaseColons: false
 40 | AlignConsecutiveTableGenBreakingDAGArgColons:
 41 |   Enabled: false
 42 |   AcrossEmptyLines: false
 43 |   AcrossComments: false
 44 |   AlignCompound: false
 45 |   AlignFunctionPointers: false
 46 |   PadOperators: false
 47 | AlignConsecutiveTableGenCondOperatorColons:
 48 |   Enabled: false
 49 |   AcrossEmptyLines: false
 50 |   AcrossComments: false
 51 |   AlignCompound: false
 52 |   AlignFunctionPointers: false
 53 |   PadOperators: false
 54 | AlignConsecutiveTableGenDefinitionColons:
 55 |   Enabled: false
 56 |   AcrossEmptyLines: false
 57 |   AcrossComments: false
 58 |   AlignCompound: false
 59 |   AlignFunctionPointers: false
 60 |   PadOperators: false
 61 | AlignEscapedNewlines: DontAlign
 62 | AlignOperands: DontAlign
 63 | AlignTrailingComments:
 64 |   Kind: Always
 65 |   OverEmptyLines: 0
 66 | AllowAllArgumentsOnNextLine: true
 67 | AllowAllParametersOfDeclarationOnNextLine: true
 68 | AllowBreakBeforeNoexceptSpecifier: Never
 69 | AllowShortBlocksOnASingleLine: Never
 70 | AllowShortCaseExpressionOnASingleLine: true
 71 | AllowShortCaseLabelsOnASingleLine: false
 72 | AllowShortCompoundRequirementOnASingleLine: true
 73 | AllowShortEnumsOnASingleLine: false
 74 | AllowShortFunctionsOnASingleLine: None
 75 | AllowShortIfStatementsOnASingleLine: Never
 76 | AllowShortLambdasOnASingleLine: All
 77 | AllowShortLoopsOnASingleLine: false
 78 | AlwaysBreakAfterDefinitionReturnType: None
 79 | AlwaysBreakBeforeMultilineStrings: true
 80 | AttributeMacros:
 81 |   - CPP
 82 |   - M256_ALIGN
 83 | BinPackArguments: true
 84 | BinPackParameters: true
 85 | BitFieldColonSpacing: Both
 86 | BraceWrapping:
 87 |   AfterCaseLabel: false
 88 |   AfterClass: false
 89 |   AfterControlStatement: Never
 90 |   AfterEnum: false
 91 |   AfterFunction: false
 92 |   AfterNamespace: false
 93 |   AfterObjCDeclaration: false
 94 |   AfterStruct: false
 95 |   AfterUnion: false
 96 |   AfterExternBlock: false
 97 |   BeforeCatch: false
 98 |   BeforeElse: false
 99 |   BeforeLambdaBody: false
100 |   BeforeWhile: false
101 |   IndentBraces: false
102 |   SplitEmptyFunction: true
103 |   SplitEmptyRecord: true
104 |   SplitEmptyNamespace: true
105 | BreakAdjacentStringLiterals: true
106 | BreakAfterAttributes: Leave
107 | BreakAfterJavaFieldAnnotations: false
108 | BreakAfterReturnType: None
109 | BreakArrays: true
110 | BreakBeforeBinaryOperators: All
111 | BreakBeforeBraces: Attach
112 | BreakBeforeConceptDeclarations: Always
113 | BreakBeforeInlineASMColon: OnlyMultiline
114 | BreakBeforeTernaryOperators: true
115 | BreakConstructorInitializers: BeforeComma
116 | BreakFunctionDefinitionParameters: false
117 | BreakInheritanceList: BeforeColon
118 | BreakStringLiterals: true
119 | BreakTemplateDeclarations: Yes
120 | ColumnLimit: 0
121 | CommentPragmas: "^ IWYU pragma:"
122 | CompactNamespaces: false
123 | ConstructorInitializerIndentWidth: 4
124 | ContinuationIndentWidth: 4
125 | Cpp11BracedListStyle: true
126 | DerivePointerAlignment: false
127 | DisableFormat: false
128 | EmptyLineAfterAccessModifier: Never
129 | EmptyLineBeforeAccessModifier: LogicalBlock
130 | ExperimentalAutoDetectBinPacking: false
131 | FixNamespaceComments: true
132 | ForEachMacros:
133 |   - BLABLA
134 | IfMacros:
135 |   - BLABLA
136 | IncludeBlocks: Preserve
137 | IncludeCategories:
138 |   - Regex: ^<ext/.*\.h>
139 |     Priority: 2
140 |     SortPriority: 0
141 |     CaseSensitive: false
142 |   - Regex: ^<.*\.h>
143 |     Priority: 1
144 |     SortPriority: 0
145 |     CaseSensitive: false
146 |   - Regex: ^<.*
147 |     Priority: 2
148 |     SortPriority: 0
149 |     CaseSensitive: false
150 |   - Regex: .*
151 |     Priority: 3
152 |     SortPriority: 0
153 |     CaseSensitive: false
154 | IncludeIsMainRegex: ([-_](test|unittest))?$
155 | IncludeIsMainSourceRegex: ""
156 | IndentAccessModifiers: false
157 | IndentCaseBlocks: false
158 | IndentCaseLabels: true
159 | IndentExternBlock: NoIndent
160 | IndentGotoLabels: true
161 | IndentPPDirectives: AfterHash
162 | IndentRequiresClause: true
163 | IndentWidth: 4
164 | IndentWrappedFunctionNames: false
165 | InsertBraces: false
166 | InsertNewlineAtEOF: false
167 | InsertTrailingCommas: None
168 | IntegerLiteralSeparator:
169 |   Binary: 0
170 |   BinaryMinDigits: 0
171 |   Decimal: 0
172 |   DecimalMinDigits: 0
173 |   Hex: 0
174 |   HexMinDigits: 0
175 | JavaScriptQuotes: Leave
176 | JavaScriptWrapImports: true
177 | KeepEmptyLines:
178 |   AtEndOfFile: false
179 |   AtStartOfBlock: false
180 |   AtStartOfFile: true
181 | LambdaBodyIndentation: Signature
182 | LineEnding: DeriveLF
183 | MacroBlockBegin: ""
184 | MacroBlockEnd: ""
185 | MainIncludeChar: Quote
186 | MaxEmptyLinesToKeep: 1
187 | NamespaceIndentation: None
188 | ObjCBinPackProtocolList: Never
189 | ObjCBlockIndentWidth: 2
190 | ObjCBreakBeforeNestedBlockParam: true
191 | ObjCSpaceAfterProperty: false
192 | ObjCSpaceBeforeProtocolList: true
193 | PPIndentWidth: -1
194 | PackConstructorInitializers: NextLine
195 | PenaltyBreakAssignment: 2
196 | PenaltyBreakBeforeFirstCallParameter: 1
197 | PenaltyBreakComment: 300
198 | PenaltyBreakFirstLessLess: 120
199 | PenaltyBreakOpenParenthesis: 0
200 | PenaltyBreakScopeResolution: 500
201 | PenaltyBreakString: 1000
202 | PenaltyBreakTemplateDeclaration: 10
203 | PenaltyExcessCharacter: 1000000
204 | PenaltyIndentedWhitespace: 0
205 | PenaltyReturnTypeOnItsOwnLine: 200
206 | PointerAlignment: Left
207 | QualifierAlignment: Leave
208 | RawStringFormats:
209 |   - Language: Cpp
210 |     Delimiters:
211 |       - cc
212 |       - CC
213 |       - cpp
214 |       - Cpp
215 |       - CPP
216 |       - c++
217 |       - C++
218 |     CanonicalDelimiter: ""
219 |     BasedOnStyle: google
220 |   - Language: TextProto
221 |     Delimiters:
222 |       - pb
223 |       - PB
224 |       - proto
225 |       - PROTO
226 |     EnclosingFunctions:
227 |       - EqualsProto
228 |       - EquivToProto
229 |       - PARSE_PARTIAL_TEXT_PROTO
230 |       - PARSE_TEST_PROTO
231 |       - PARSE_TEXT_PROTO
232 |       - ParseTextOrDie
233 |       - ParseTextProtoOrDie
234 |       - ParseTestProto
235 |       - ParsePartialTestProto
236 |     CanonicalDelimiter: pb
237 |     BasedOnStyle: google
238 | ReferenceAlignment: Pointer
239 | ReflowComments: true
240 | RemoveBracesLLVM: false
241 | RemoveParentheses: Leave
242 | RemoveSemicolon: false
243 | RequiresClausePosition: OwnLine
244 | RequiresExpressionIndentation: OuterScope
245 | SeparateDefinitionBlocks: Always
246 | ShortNamespaceLines: 1
247 | SkipMacroDefinitionBody: false
248 | SortIncludes: CaseSensitive
249 | SortJavaStaticImport: Before
250 | SortUsingDeclarations: LexicographicNumeric
251 | SpaceAfterCStyleCast: false
252 | SpaceAfterLogicalNot: false
253 | SpaceAfterTemplateKeyword: true
254 | SpaceAroundPointerQualifiers: Default
255 | SpaceBeforeAssignmentOperators: true
256 | SpaceBeforeCaseColon: false
257 | SpaceBeforeCpp11BracedList: false
258 | SpaceBeforeCtorInitializerColon: true
259 | SpaceBeforeInheritanceColon: true
260 | SpaceBeforeJsonColon: false
261 | SpaceBeforeParens: ControlStatements
262 | SpaceBeforeParensOptions:
263 |   AfterControlStatements: true
264 |   AfterForeachMacros: true
265 |   AfterFunctionDeclarationName: false
266 |   AfterFunctionDefinitionName: false
267 |   AfterIfMacros: true
268 |   AfterOverloadedOperator: false
269 |   AfterPlacementOperator: true
270 |   AfterRequiresInClause: false
271 |   AfterRequiresInExpression: false
272 |   BeforeNonEmptyParentheses: false
273 | SpaceBeforeRangeBasedForLoopColon: true
274 | SpaceBeforeSquareBrackets: false
275 | SpaceInEmptyBlock: false
276 | SpacesBeforeTrailingComments: 1
277 | SpacesInAngles: Never
278 | SpacesInContainerLiterals: true
279 | SpacesInLineCommentPrefix:
280 |   Minimum: 1
281 |   Maximum: -1
282 | SpacesInParens: Never
283 | SpacesInParensOptions:
284 |   ExceptDoubleParentheses: false
285 |   InConditionalStatements: false
286 |   InCStyleCasts: false
287 |   InEmptyParentheses: false
288 |   Other: false
289 | SpacesInSquareBrackets: false
290 | Standard: Auto
291 | StatementAttributeLikeMacros:
292 |   - M256_ALIGN
293 | StatementMacros:
294 |   - BLABLA
295 | TabWidth: 4
296 | TableGenBreakInsideDAGArg: DontBreak
297 | UseTab: Never
298 | VerilogBreakBetweenInstancePorts: true
299 | WhitespaceSensitiveMacros:
300 |   - BLABLA
301 | AlwaysBreakAfterReturnType: None
302 | AlwaysBreakTemplateDeclarations: Yes
303 | KeepEmptyLinesAtTheStartOfBlocks: false
304 | Language: Cpp
305 | SpaceInEmptyParentheses: false
306 | SpacesInCStyleCastParentheses: false
307 | SpacesInConditionalStatement: false
308 | SpacesInParentheses: false
309 | TypenameMacros:
310 |   - BLABLA
311 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.22...3.30)
 2 | # FetchContent is only used further down, to pull in sse2neon for ARM64 targets
 3 | include(FetchContent)
 4 | 
 5 | # Detect ARM64 targets from either the generator platform or the system processor name
 6 | if((CMAKE_GENERATOR_PLATFORM MATCHES "ARM64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64"))
 7 |     set(IS_ARM64 TRUE)
 8 | else()
 9 |     set(IS_ARM64 FALSE)
10 | endif()
11 | 
12 | # ARM64 only: download sse2neon; its source directory is added to the include path at the end of this file
13 | if(IS_ARM64)
14 |     FetchContent_Declare(
15 |         sse2neon
16 |         GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
17 |         GIT_TAG master # NOTE(review): follows a moving branch; pinning a tag or commit would make builds reproducible
18 |         GIT_SHALLOW 1
19 |     )
20 | 
21 |     message("MathLib: Downloading sse2neon...")
22 |     FetchContent_MakeAvailable(sse2neon)
23 | endif()
24 | 
25 | # Sources: collected into variables and arranged into IDE source groups
26 | file(GLOB MATHLIB_H
27 |     "ml.h"
28 |     "ml.hlsli"
29 | )
30 | source_group("MathLib" FILES ${MATHLIB_H})
31 | 
32 | file(GLOB MATHLIB_GUTS "Guts/*")
33 | source_group("MathLib/Guts" FILES ${MATHLIB_GUTS})
34 | # Mark the HLSL include as excluded from the build in Visual Studio projects
35 | set_property(SOURCE "ml.hlsli" PROPERTY VS_SETTINGS "ExcludedFromBuild=true")
36 | 
37 | # Library: header-only, so an INTERFACE target with nothing to compile
38 | add_library(MathLib INTERFACE)
39 | # On Windows the headers are attached to the target and the target is placed in a "MathLib" folder
40 | if(WIN32) # TODO: is MSVC?
41 |     target_sources(MathLib PRIVATE ${MATHLIB_H} ${MATHLIB_GUTS})
42 |     set_target_properties(MathLib PROPERTIES FOLDER "MathLib")
43 | endif()
44 | 
45 | if(IS_ARM64 AND MSVC)
46 |     target_compile_options(MathLib INTERFACE
47 |         # ARM64 builds require modern preprocessor
48 |         /Zc:preprocessor
49 |         # Suppress known warnings:
50 |         /wd4310 # cast truncates constant value
51 |         /wd4127 # conditional expression is constant
52 |     )
53 | endif()
54 | # Consumers get this directory on their include path, plus the sse2neon sources when targeting ARM64
55 | target_include_directories(MathLib INTERFACE
56 |     .
57 |     $<$<BOOL:${IS_ARM64}>:${sse2neon_SOURCE_DIR}>
58 | )
59 | 


--------------------------------------------------------------------------------
/Guts/bool1.h:
--------------------------------------------------------------------------------
 1 | // © 2021 NVIDIA Corporation
 2 | 
 3 | #pragma once
 4 | 
 5 | struct bool2 { // Two-component boolean vector packed into a bitmask
 6 |     int32_t mask; // bit 0 = x, bit 1 = y (as read by the conversion operators in conversion.h)
 7 | 
 8 | public:
 9 |     ML_INLINE bool2(int32_t m) // wraps an already computed bitmask; the value is stored unchanged
10 |         : mask(m) {
11 |     }
12 |     // Conversions produce 1 (bit set) or 0 (bit clear) per component; they are defined in conversion.h
13 |     ML_INLINE operator int2() const;
14 |     ML_INLINE operator uint2() const;
15 |     ML_INLINE operator float2() const;
16 |     ML_INLINE operator double2() const;
17 | };
18 | 
19 | struct bool3 { // Three-component boolean vector packed into a bitmask
20 |     int32_t mask; // bit 0 = x, bit 1 = y, bit 2 = z (as read by the conversion operators in conversion.h)
21 | 
22 | public:
23 |     ML_INLINE bool3(int32_t m) // wraps an already computed bitmask; the value is stored unchanged
24 |         : mask(m) {
25 |     }
26 |     // Conversions produce 1 (bit set) or 0 (bit clear) per component; they are defined in conversion.h
27 |     ML_INLINE operator int3() const;
28 |     ML_INLINE operator uint3() const;
29 |     ML_INLINE operator float3() const;
30 |     ML_INLINE operator double3() const;
31 | };
32 | 
33 | struct bool4 { // Four-component boolean vector packed into a bitmask
34 |     int32_t mask; // bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w (as read by the conversion operators in conversion.h)
35 | 
36 | public:
37 |     ML_INLINE bool4(int32_t m) // wraps an already computed bitmask; the value is stored unchanged
38 |         : mask(m) {
39 |     }
40 |     // Conversions produce 1 (bit set) or 0 (bit clear) per component; they are defined in conversion.h
41 |     ML_INLINE operator int4() const;
42 |     ML_INLINE operator uint4() const;
43 |     ML_INLINE operator float4() const;
44 |     ML_INLINE operator double4() const;
45 | };
46 | 
47 | ML_INLINE bool all(bool b) { // scalar overload: returns its argument
48 |     return b;
49 | }
50 | // Vector overloads: true only when every selected bit of the mask is set. ML_Mask is defined elsewhere; presumably it builds the lane bitmask.
51 | ML_INLINE bool all(bool2 b) {
52 |     return (b.mask & ML_Mask(1, 1, 0, 0)) == ML_Mask(1, 1, 0, 0); // bits outside the selection are ignored
53 | }
54 | 
55 | ML_INLINE bool all(bool3 b) {
56 |     return (b.mask & ML_Mask(1, 1, 1, 0)) == ML_Mask(1, 1, 1, 0);
57 | }
58 | 
59 | ML_INLINE bool all(bool4 b) {
60 |     return (b.mask & ML_Mask(1, 1, 1, 1)) == ML_Mask(1, 1, 1, 1);
61 | }
62 | 
63 | ML_INLINE bool any(bool b) { // scalar overload: returns its argument
64 |     return b;
65 | }
66 | // Vector overloads: true when at least one selected bit of the mask is set. ML_Mask is defined elsewhere; presumably it builds the lane bitmask.
67 | ML_INLINE bool any(bool2 b) {
68 |     return (b.mask & ML_Mask(1, 1, 0, 0)) != 0; // bits outside the selection are ignored
69 | }
70 | 
71 | ML_INLINE bool any(bool3 b) {
72 |     return (b.mask & ML_Mask(1, 1, 1, 0)) != 0;
73 | }
74 | 
75 | ML_INLINE bool any(bool4 b) {
76 |     return (b.mask & ML_Mask(1, 1, 1, 1)) != 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/Guts/conversion.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | // asfloat: reinterpret the bits of unsigned integers as floats, without any numeric conversion
  6 | ML_INLINE float asfloat(uint32_t x) {
  7 |     return *(float*)&x; // NOTE(review): pointer-cast type pun, formally a strict-aliasing violation; a union or memcpy would be safer
  8 | }
  9 | 
 10 | ML_INLINE float2 asfloat(const uint2& x) {
 11 |     return float2(asfloat(x.x), asfloat(x.y)); // per component, through the scalar overload
 12 | }
 13 | 
 14 | ML_INLINE float4 asfloat(const uint4& x) {
 15 |     return _mm_castsi128_ps(x.xmm); // register type cast only
 16 | }
 17 | 
 18 | // asuint: reinterpret the bits of floats as unsigned integers, without any numeric conversion
 19 | ML_INLINE uint32_t asuint(float x) {
 20 |     return *(uint32_t*)&x; // NOTE(review): pointer-cast type pun, formally a strict-aliasing violation; a union or memcpy would be safer
 21 | }
 22 | 
 23 | ML_INLINE uint2 asuint(const float2& x) {
 24 |     return uint2(asuint(x.x), asuint(x.y)); // per component, through the scalar overload
 25 | }
 26 | 
 27 | ML_INLINE uint4 asuint(const float4& x) {
 28 |     return _mm_castps_si128(x.xmm); // register type cast only
 29 | }
 30 | 
 31 | // From bool2: mask bit 0 drives x and bit 1 drives y; a set bit becomes 1, a clear bit becomes 0
 32 | ML_INLINE bool2::operator int2() const {
 33 |     return int2((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0);
 34 | }
 35 | 
 36 | ML_INLINE bool2::operator uint2() const {
 37 |     return uint2((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0);
 38 | }
 39 | 
 40 | ML_INLINE bool2::operator float2() const {
 41 |     return float2((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f);
 42 | }
 43 | 
 44 | ML_INLINE bool2::operator double2() const {
 45 |     return double2((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0);
 46 | }
 47 | 
 48 | // From bool3: mask bits 0, 1, 2 drive x, y, z; a set bit becomes 1, a clear bit becomes 0
 49 | ML_INLINE bool3::operator int3() const {
 50 |     return int3((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0);
 51 | }
 52 | 
 53 | ML_INLINE bool3::operator uint3() const {
 54 |     return uint3((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0);
 55 | }
 56 | 
 57 | ML_INLINE bool3::operator float3() const {
 58 |     return float3((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f, (mask & 0x4) ? 1.0f : 0.0f);
 59 | }
 60 | 
 61 | ML_INLINE bool3::operator double3() const {
 62 |     return double3((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0, (mask & 0x4) ? 1.0 : 0.0);
 63 | }
 64 | 
 65 | // From bool4: mask bits 0, 1, 2, 3 drive x, y, z, w; a set bit becomes 1, a clear bit becomes 0
 66 | ML_INLINE bool4::operator int4() const {
 67 |     return int4((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0, (mask & 0x8) ? 1 : 0);
 68 | }
 69 | 
 70 | ML_INLINE bool4::operator uint4() const {
 71 |     return uint4((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0, (mask & 0x8) ? 1 : 0);
 72 | }
 73 | 
 74 | ML_INLINE bool4::operator float4() const {
 75 |     return float4((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f, (mask & 0x4) ? 1.0f : 0.0f, (mask & 0x8) ? 1.0f : 0.0f);
 76 | }
 77 | 
 78 | ML_INLINE bool4::operator double4() const {
 79 |     return double4((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0, (mask & 0x4) ? 1.0 : 0.0, (mask & 0x8) ? 1.0 : 0.0);
 80 | }
 81 | 
 82 | // From int2: each component goes through a plain C-style cast
 83 | ML_INLINE int2::operator uint2() const {
 84 |     return uint2((uint)x, (uint)y);
 85 | }
 86 | 
 87 | ML_INLINE int2::operator float2() const {
 88 |     return float2((float)x, (float)y);
 89 | }
 90 | 
 91 | ML_INLINE int2::operator double2() const {
 92 |     return double2((double)x, (double)y);
 93 | }
 94 | 
 95 | // From uint2: each component goes through a plain C-style cast
 96 | ML_INLINE uint2::operator int2() const {
 97 |     return int2((int32_t)x, (int32_t)y);
 98 | }
 99 | 
100 | ML_INLINE uint2::operator float2() const {
101 |     return float2((float)x, (float)y);
102 | }
103 | 
104 | ML_INLINE uint2::operator double2() const {
105 |     return double2((double)x, (double)y);
106 | }
107 | 
108 | // From float2: each component goes through a plain C-style cast (float-to-integer casts drop the fractional part)
109 | ML_INLINE float2::operator int2() const {
110 |     return int2((int32_t)x, (int32_t)y);
111 | }
112 | 
113 | ML_INLINE float2::operator uint2() const {
114 |     return uint2((uint)x, (uint)y);
115 | }
116 | 
117 | ML_INLINE float2::operator double2() const {
118 |     return double2((double)x, (double)y);
119 | }
120 | 
121 | // From double2: each component goes through a plain C-style cast (double-to-integer casts drop the fractional part)
122 | ML_INLINE double2::operator int2() const {
123 |     return int2((int32_t)x, (int32_t)y);
124 | }
125 | 
126 | ML_INLINE double2::operator uint2() const {
127 |     return uint2((uint)x, (uint)y);
128 | }
129 | 
130 | ML_INLINE double2::operator float2() const {
131 |     return float2((float)x, (float)y);
132 | }
133 | 
134 | // From int3: conversions operate on the whole SIMD register at once
135 | ML_INLINE int3::operator uint3() const {
136 |     return xmm; // the register is handed over as is
137 | }
138 | 
139 | ML_INLINE int3::operator float3() const {
140 |     return _mm_cvtepi32_ps(xmm);
141 | }
142 | 
143 | ML_INLINE int3::operator double3() const {
144 |     return _mm256_cvtepi32_pd(xmm);
145 | }
146 | 
147 | // From uint3: conversions operate on the whole SIMD register at once
148 | ML_INLINE uint3::operator int3() const {
149 |     return xmm; // the register is handed over as is
150 | }
151 | 
152 | ML_INLINE uint3::operator float3() const {
153 |     return _mm_cvtepi32_ps(xmm); // NOTE(review): this intrinsic reads lanes as signed, so values >= 2^31 would come out negative - confirm
154 | }
155 | 
156 | ML_INLINE uint3::operator double3() const {
157 |     return _mm256_cvtepi32_pd(xmm); // NOTE(review): signed lane interpretation here as well - confirm
158 | }
159 | 
160 | // From float3: conversions operate on the whole SIMD register at once
161 | ML_INLINE float3::operator int3() const {
162 |     return _mm_cvtps_epi32(xmm); // NOTE(review): rounds per the current SSE rounding mode, while float2::operator int2 truncates via a C cast - confirm this is intended
163 | }
164 | 
165 | ML_INLINE float3::operator uint3() const {
166 |     return _mm_cvtps_epi32(xmm); // same signed conversion as operator int3; NOTE(review): confirm behavior for values above INT32_MAX
167 | }
168 | 
169 | ML_INLINE float3::operator double3() const {
170 |     return _mm256_cvtps_pd(xmm);
171 | }
172 | 
173 | // From double3: conversions operate on the whole SIMD register at once
174 | ML_INLINE double3::operator int3() const {
175 |     return _mm256_cvtpd_epi32(ymm); // NOTE(review): rounds per the current SSE rounding mode, while double2::operator int2 truncates via a C cast - confirm this is intended
176 | }
177 | 
178 | ML_INLINE double3::operator uint3() const {
179 |     return _mm256_cvtpd_epi32(ymm); // same signed conversion as operator int3; NOTE(review): confirm behavior for values above INT32_MAX
180 | }
181 | 
182 | ML_INLINE double3::operator float3() const {
183 |     return _mm256_cvtpd_ps(ymm);
184 | }
185 | 
186 | // From int4: conversions operate on the whole SIMD register at once
187 | ML_INLINE int4::operator uint4() const {
188 |     return xmm; // the register is handed over as is
189 | }
190 | 
191 | ML_INLINE int4::operator float4() const {
192 |     return _mm_cvtepi32_ps(xmm);
193 | }
194 | 
195 | ML_INLINE int4::operator double4() const {
196 |     return _mm256_cvtepi32_pd(xmm);
197 | }
198 | 
199 | // From uint4: conversions operate on the whole SIMD register at once
200 | ML_INLINE uint4::operator int4() const {
201 |     return xmm; // the register is handed over as is
202 | }
203 | 
204 | ML_INLINE uint4::operator float4() const {
205 |     return _mm_cvtepi32_ps(xmm); // NOTE(review): this intrinsic reads lanes as signed, so values >= 2^31 would come out negative - confirm
206 | }
207 | 
208 | ML_INLINE uint4::operator double4() const {
209 |     return _mm256_cvtepi32_pd(xmm); // NOTE(review): signed lane interpretation here as well - confirm
210 | }
211 | 
212 | // From float4: conversions operate on the whole SIMD register at once
213 | ML_INLINE float4::operator int4() const {
214 |     return _mm_cvtps_epi32(xmm); // NOTE(review): rounds per the current SSE rounding mode, while float2::operator int2 truncates via a C cast - confirm this is intended
215 | }
216 | 
217 | ML_INLINE float4::operator uint4() const {
218 |     return _mm_cvtps_epi32(xmm); // same signed conversion as operator int4; NOTE(review): confirm behavior for values above INT32_MAX
219 | }
220 | 
221 | ML_INLINE float4::operator double4() const {
222 |     return _mm256_cvtps_pd(xmm);
223 | }
224 | 
225 | // From double4: conversions operate on the whole SIMD register at once
226 | ML_INLINE double4::operator int4() const {
227 |     return _mm256_cvtpd_epi32(ymm); // NOTE(review): rounds per the current SSE rounding mode, while double2::operator int2 truncates via a C cast - confirm this is intended
228 | }
229 | 
230 | ML_INLINE double4::operator uint4() const {
231 |     return _mm256_cvtpd_epi32(ymm); // same signed conversion as operator int4; NOTE(review): confirm behavior for values above INT32_MAX
232 | }
233 | 
234 | ML_INLINE double4::operator float4() const {
235 |     return _mm256_cvtpd_ps(ymm);
236 | }
237 | 
238 | // From float4x4: widen each of the four ca[] entries from float to double
239 | ML_INLINE float4x4::operator double4x4() const {
240 |     double4x4 r;
241 |     r.ca[0] = _mm256_cvtps_pd(ca[0]);
242 |     r.ca[1] = _mm256_cvtps_pd(ca[1]);
243 |     r.ca[2] = _mm256_cvtps_pd(ca[2]);
244 |     r.ca[3] = _mm256_cvtps_pd(ca[3]);
245 | 
246 |     return r;
247 | }
248 | 
249 | // From double4x4: narrow each of the four ca[] entries from double to float
250 | ML_INLINE double4x4::operator float4x4() const {
251 |     float4x4 r;
252 |     r.ca[0] = _mm256_cvtpd_ps(ca[0]);
253 |     r.ca[1] = _mm256_cvtpd_ps(ca[1]);
254 |     r.ca[2] = _mm256_cvtpd_ps(ca[2]);
255 |     r.ca[3] = _mm256_cvtpd_ps(ca[3]);
256 | 
257 |     return r;
258 | }
259 | 


--------------------------------------------------------------------------------
/Guts/f16.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | #define F16_M_BITS 10 // mantissa width of the 16-bit float
  6 | #define F16_E_BITS 5 // exponent width of the 16-bit float
  7 | #define F16_S_MASK 0x8000 // sign bit of the 16-bit float
  8 | 
  9 | template <uint32_t M_BITS, uint32_t E_BITS, uint32_t S_MASK> // mantissa bits, exponent bits and sign-bit mask of the packed format
 10 | ML_INLINE uint32_t ToSmallFloat(float x) { // packs a 32-bit float into the small format and returns the packed bits
 11 |     const int32_t E_MASK = (1 << E_BITS) - 1; // exponent field with all bits set
 12 |     const uint32_t INF = uint32_t(E_MASK) << uint32_t(M_BITS); // all-ones exponent, zero mantissa
 13 |     const int32_t BIAS = E_MASK >> 1;
 14 |     const int32_t ROUND = 1 << (23 - M_BITS - 1); // most significant mantissa bit that gets dropped
 15 | 
 16 |     // decompose float
 17 |     uint32_t f32 = *(uint32_t*)&x; // NOTE(review): pointer-cast type pun; a union or memcpy would avoid the aliasing question
 18 |     uint32_t packed = (f32 >> 16) & S_MASK; // sign moves from bit 31 down to bit 15
 19 |     int32_t e = ((f32 >> 23) & 0xFF) - 127 + BIAS; // exponent re-biased for the packed format
 20 |     int32_t m = f32 & 0x007FFFFF;
 21 | 
 22 |     if (e == 128 + BIAS) {
 23 |         // Inf
 24 |         packed |= INF;
 25 | 
 26 |         if (m) {
 27 |             // NaN
 28 |             m >>= 23 - M_BITS;
 29 |             packed |= m | (m == 0); // keep the mantissa non-zero so a NaN never collapses into Inf
 30 |         }
 31 |     } else if (e > 0) {
 32 |         // round to nearest, round "0.5" up
 33 |         if (m & ROUND) {
 34 |             m += ROUND << 1;
 35 | 
 36 |             if (m & 0x00800000) {
 37 |                 // mantissa overflow
 38 |                 m = 0;
 39 |                 e++;
 40 |             }
 41 |         }
 42 | 
 43 |         if (e >= E_MASK) {
 44 |             // exponent overflow - flush to Inf
 45 |             packed |= INF;
 46 |         } else {
 47 |             // representable value
 48 |             m >>= 23 - M_BITS;
 49 |             packed |= (e << M_BITS) | m;
 50 |         }
 51 |     } else {
 52 |         // denormalized or zero: a shift of 24+ bits leaves nothing of the 24-bit significand, and 32+ would be undefined behavior
 53 |         m = (1 - e) < 24 ? ((m | 0x00800000) >> (1 - e)) + ROUND : 0;
 54 |         m >>= 23 - M_BITS;
 55 |         packed |= m;
 56 |     }
 57 | 
 58 |     return packed;
 59 | }
 60 | 
 61 | template <int32_t M_BITS, int32_t E_BITS, int32_t S_MASK> // mantissa bits, exponent bits and sign-bit mask of the packed format
 62 | ML_INLINE float FromSmallFloat(uint32_t x) { // expands packed small-float bits into a 32-bit float
 63 |     const uint32_t E_MASK = (1 << E_BITS) - 1;
 64 |     const int32_t BIAS = E_MASK >> 1;
 65 |     const float DENORM_SCALE = 1.0f / (1 << (14 + M_BITS)); // NOTE(review): 14 equals BIAS - 1 only when E_BITS is 5
 66 |     const float NORM_SCALE = 1.0f / float(1 << M_BITS);
 67 |     // split the input into a sign flag, the biased exponent and the mantissa
 68 |     int32_t s = (x & S_MASK) << 15; // non-zero when the sign bit is set; tested as a flag before returning
 69 |     int32_t e = (x >> M_BITS) & E_MASK;
 70 |     int32_t m = x & ((1 << M_BITS) - 1);
 71 | 
 72 |     uFloat f;
 73 |     if (e == 0)
 74 |         f.f = DENORM_SCALE * m; // zero or denormal: no implicit leading one
 75 |     else if (e == E_MASK)
 76 |         f.i = s | 0x7F800000 | (m << (23 - M_BITS)); // Inf / NaN: all-ones float exponent, mantissa moved to the top of the 23-bit field
 77 |     else {
 78 |         f.f = 1.0f + float(m) * NORM_SCALE; // normal value: 1.mantissa, scaled by the power of two below
 79 | 
 80 |         if (e < BIAS)
 81 |             f.f /= float(1 << (BIAS - e));
 82 |         else
 83 |             f.f *= float(1 << (e - BIAS));
 84 |     }
 85 | 
 86 |     if (s)
 87 |         f.f = -f.f;
 88 | 
 89 |     return f.f;
 90 | }
 91 | 
 92 | ML_INLINE uint32_t f32tof16(float x) { // converts a float to 16-bit float bits
 93 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
 94 |     v4f v = v4f_set(x, 0.0f, 0.0f, 0.0f); // x is passed first, the remaining lanes are zero
 95 |     v4i p = v4f_to_h4(v); // helper defined elsewhere; presumably converts the four lanes to half precision
 96 | 
 97 |     uint32_t r = _mm_cvtsi128_si32(p);
 98 | #else
 99 |     uint32_t r = ToSmallFloat<F16_M_BITS, F16_E_BITS, F16_S_MASK>(x); // software fallback
100 | #endif
101 | 
102 |     return r;
103 | }
104 | 
105 | ML_INLINE float f16tof32(uint32_t x) { // converts 16-bit float bits to a float
106 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
107 |     v4i p = _mm_cvtsi32_si128(x);
108 |     v4f f = _mm_cvtph_ps(p);
109 | 
110 |     return _mm_cvtss_f32(f); // only the first lane is returned
111 | #else
112 |     return FromSmallFloat<F16_M_BITS, F16_E_BITS, F16_S_MASK>(x); // software fallback
113 | #endif
114 | }
115 | 
116 | #ifndef __ARM_NEON
117 | struct float16_t { // 16-bit float storage type; skipped when __ARM_NEON is defined (presumably the toolchain supplies float16_t there)
118 |     uint16_t x; // raw half-precision bits
119 | 
120 |     ML_INLINE float16_t(float v) { // converts through f32tof16 and keeps the low 16 bits
121 |         x = (uint16_t)f32tof16(v);
122 |     }
123 | 
124 |     ML_INLINE operator float() const { // converts back through f16tof32
125 |         return f16tof32(x);
126 |     }
127 | 
128 |     ML_INLINE float16_t() = default;
129 |     ML_INLINE float16_t(const float16_t&) = default;
130 |     ML_INLINE float16_t& operator=(const float16_t&) = default;
131 | };
132 | #endif
133 | 
134 | struct float16_t2 { // pair of 16-bit floats
135 |     float16_t x, y;
136 | 
137 |     ML_INLINE float16_t2(const float16_t& x, const float16_t& y) // member-wise construction
138 |         : x(x), y(y) {
139 |     }
140 | 
141 |     ML_INLINE float16_t2() = default;
142 |     ML_INLINE float16_t2(const float16_t2&) = default;
143 |     ML_INLINE float16_t2& operator=(const float16_t2&) = default;
144 | };
145 | 
146 | struct float16_t4 { // four 16-bit floats
147 |     float16_t x, y, z, w;
148 | 
149 |     ML_INLINE float16_t4(const float16_t& x, const float16_t& y, const float16_t& z, const float16_t& w) // member-wise construction
150 |         : x(x), y(y), z(z), w(w) {
151 |     }
152 |     // Builds the vector from two halves: xy supplies x and y, zw supplies z and w
153 |     ML_INLINE float16_t4(const float16_t2& xy, const float16_t2& zw)
154 |         : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {
155 |         // members are initialized directly instead of storing a float16_t2 through a casted member pointer
156 |     }
157 | 
158 |     ML_INLINE float16_t4() = default;
159 |     ML_INLINE float16_t4(const float16_t4&) = default;
160 |     ML_INLINE float16_t4& operator=(const float16_t4&) = default;
161 | };
162 | 


--------------------------------------------------------------------------------
/Guts/f32.h:
--------------------------------------------------------------------------------
   1 | // © 2021 NVIDIA Corporation
   2 | 
   3 | #pragma once
   4 | 
   5 | //======================================================================================================================
   6 | // float2
   7 | //======================================================================================================================
   8 | 
   9 | union float2 { // 2-component float vector; the members below are alternative views of the same storage
  10 |     v2i mm; // whole-vector view, used by the default constructor and by copy-assignment
  11 | 
  12 |     struct {
  13 |         float a[COORD_2D]; // array view of the components
  14 |     };
  15 | 
  16 |     struct {
  17 |         float x, y; // named view of the components
  18 |     };
  19 | 
  20 |     ML_SWIZZLE_2(float2, float); // swizzle members come from a macro defined elsewhere
  21 | 
  22 | public:
  23 |     ML_INLINE float2() // sets the mm view to 0
  24 |         : mm(0) {
  25 |     }
  26 | 
  27 |     ML_INLINE float2(float c) // broadcast: both components receive c
  28 |         : x(c), y(c) {
  29 |     }
  30 | 
  31 |     ML_INLINE float2(float _x, float _y)
  32 |         : x(_x), y(_y) {
  33 |     }
  34 | 
  35 |     ML_INLINE float2(const float2& v) = default;
  36 | 
  37 |     // Set
  38 | 
  39 |     ML_INLINE void operator=(const float2& v) { // copies through the mm view; returns void, so assignments cannot be chained
  40 |         mm = v.mm;
  41 |     }
  42 | 
  43 |     // Conversion (declared here, defined in conversion.h)
  44 | 
  45 |     ML_INLINE operator int2() const;
  46 |     ML_INLINE operator uint2() const;
  47 |     ML_INLINE operator double2() const;
  48 | 
  49 |     // Compare (operators generated by ML_COMPARE_UNOPT, defined elsewhere; bool2 is presumably the result type)
  50 | 
  51 |     ML_COMPARE_UNOPT(bool2, float2, <)
  52 |     ML_COMPARE_UNOPT(bool2, float2, <=)
  53 |     ML_COMPARE_UNOPT(bool2, float2, ==)
  54 |     ML_COMPARE_UNOPT(bool2, float2, >=)
  55 |     ML_COMPARE_UNOPT(bool2, float2, >)
  56 |     ML_COMPARE_UNOPT(bool2, float2, !=)
  57 | 
  58 |     // Ops (binary and compound operators generated by ML_OP_UNOPT, defined elsewhere)
  59 | 
  60 |     ML_INLINE float2 operator-() const { // negates both components
  61 |         return float2(-x, -y);
  62 |     }
  63 | 
  64 |     ML_OP_UNOPT(float2, float, -, -=)
  65 |     ML_OP_UNOPT(float2, float, +, +=)
  66 |     ML_OP_UNOPT(float2, float, *, *=)
  67 |     ML_OP_UNOPT(float2, float, /, /=)
  68 | };
  69 | 
  70 | ML_INLINE float2 degrees(const float2& x) { // radians to degrees, per component
  71 |     return x * (180.0f / acosf(-1.0f)); // acosf(-1.0f) evaluates to pi
  72 | }
  73 | 
  74 | ML_INLINE float2 radians(const float2& x) { // degrees to radians, per component
  75 |     return x * (acosf(-1.0f) / 180.0f); // acosf(-1.0f) evaluates to pi
  76 | }
  77 | 
  78 | ML_INLINE float2 sign(const float2& x) { // applies the scalar sign() to each component
  79 |     return float2(sign(x.x), sign(x.y));
  80 | }
  81 | 
  82 | ML_INLINE float2 abs(const float2& x) { // applies the scalar abs() to each component
  83 |     return float2(abs(x.x), abs(x.y));
  84 | }
  85 | 
  86 | ML_INLINE float2 floor(const float2& x) { // applies the scalar floor() to each component
  87 |     return float2(floor(x.x), floor(x.y));
  88 | }
  89 | 
  90 | ML_INLINE float2 round(const float2& x) { // applies the scalar round() to each component
  91 |     return float2(round(x.x), round(x.y));
  92 | }
  93 | 
  94 | ML_INLINE float2 ceil(const float2& x) { // applies the scalar ceil() to each component
  95 |     return float2(ceil(x.x), ceil(x.y));
  96 | }
  97 | 
  98 | ML_INLINE float2 frac(const float2& x) { // applies the scalar frac() to each component
  99 |     return float2(frac(x.x), frac(x.y));
 100 | }
 101 | 
 102 | ML_INLINE float2 fmod(const float2& x, const float2& y) { // applies the scalar fmod() to matching components of x and y
 103 |     return float2(fmod(x.x, y.x), fmod(x.y, y.y));
 104 | }
 105 | 
 106 | ML_INLINE float2 min(const float2& x, const float2& y) { // applies the scalar min() to matching components
 107 |     return float2(min(x.x, y.x), min(x.y, y.y));
 108 | }
 109 | 
 110 | ML_INLINE float2 max(const float2& x, const float2& y) { // applies the scalar max() to matching components
 111 |     return float2(max(x.x, y.x), max(x.y, y.y));
 112 | }
 113 | 
 114 | ML_INLINE float2 clamp(const float2& x, const float2& a, const float2& b) { // applies the scalar clamp() per component, with a and b as the bounds
 115 |     return float2(clamp(x.x, a.x, b.x), clamp(x.y, a.y, b.y));
 116 | }
 117 | 
 118 | ML_INLINE float2 saturate(const float2& x) { // clamps each component with the bounds 0 and 1
 119 |     return float2(clamp(x.x, 0.0f, 1.0f), clamp(x.y, 0.0f, 1.0f));
 120 | }
 121 | 
 122 | ML_INLINE float2 lerp(const float2& a, const float2& b, const float2& x) { // linear interpolation from a to b with per-component weight x
 123 |     return a + (b - a) * x; // x is not clamped
 124 | }
 125 | 
 126 | ML_INLINE float2 linearstep(const float2& a, const float2& b, const float2& x) { // saturated position of x between a and b
 127 |     return saturate((x - a) / (b - a)); // no guard for a == b: that case divides by zero
 128 | }
 129 | 
 130 | ML_INLINE float2 smoothstep(const float2& a, const float2& b, const float2& x) { // linearstep followed by the cubic t * t * (3 - 2 * t)
 131 |     float2 t = linearstep(a, b, x);
 132 | 
 133 |     return t * t * (3.0f - 2.0f * t);
 134 | }
 135 | 
 136 | ML_INLINE float2 step(const float2& edge, const float2& x) { // applies the scalar step() to matching components of edge and x
 137 |     return float2(step(edge.x, x.x), step(edge.y, x.y));
 138 | }
 139 | 
 140 | ML_INLINE float2 sin(const float2& x) { // applies the scalar sin() to each component
 141 |     return float2(sin(x.x), sin(x.y));
 142 | }
 143 | 
 144 | ML_INLINE float2 cos(const float2& x) { // applies the scalar cos() to each component
 145 |     return float2(cos(x.x), cos(x.y));
 146 | }
 147 | 
 148 | ML_INLINE float2 tan(const float2& x) { // applies the scalar tan() to each component
 149 |     return float2(tan(x.x), tan(x.y));
 150 | }
 151 | 
 152 | ML_INLINE float2 asin(const float2& x) { // applies the scalar asin() to each component
 153 |     return float2(asin(x.x), asin(x.y));
 154 | }
 155 | 
 156 | ML_INLINE float2 acos(const float2& x) { // applies the scalar acos() to each component
 157 |     return float2(acos(x.x), acos(x.y));
 158 | }
 159 | 
 160 | ML_INLINE float2 atan(const float2& x) { // applies the scalar atan() to each component
 161 |     return float2(atan(x.x), atan(x.y));
 162 | }
 163 | 
 164 | ML_INLINE float2 atan2(const float2& y, const float2& x) { // applies the scalar atan2() to matching components; y is the first argument
 165 |     return float2(atan2(y.x, x.x), atan2(y.y, x.y));
 166 | }
 167 | 
 168 | ML_INLINE float2 sqrt(const float2& x) {
 169 |     return float2(sqrt(x.x), sqrt(x.y));
 170 | }
 171 | 
 172 | ML_INLINE float2 rsqrt(const float2& x) {
 173 |     return float2(rsqrt(x.x), rsqrt(x.y));
 174 | }
 175 | 
 176 | ML_INLINE float2 rcp(const float2& x) {
 177 |     return float2(rcp(x.x), rcp(x.y));
 178 | }
 179 | 
 180 | ML_INLINE float2 pow(const float2& x, const float2& y) {
 181 |     return float2(pow(x.x, y.x), pow(x.y, y.y));
 182 | }
 183 | 
 184 | ML_INLINE float2 log(const float2& x) {
 185 |     return float2(log(x.x), log(x.y));
 186 | }
 187 | 
 188 | ML_INLINE float2 log2(const float2& x) {
 189 |     return float2(log2(x.x), log2(x.y));
 190 | }
 191 | 
 192 | ML_INLINE float2 exp(const float2& x) {
 193 |     return float2(exp(x.x), exp(x.y));
 194 | }
 195 | 
 196 | ML_INLINE float2 exp2(const float2& x) {
 197 |     return float2(exp2(x.x), exp2(x.y));
 198 | }
 199 | 
 200 | ML_INLINE float2 madd(const float2& a, const float2& b, const float2& c) {
 201 |     return a * b + c;
 202 | }
 203 | 
// 2D dot product: a.x * b.x + a.y * b.y.
// NOTE: kept as a single scalar expression - splitting or reordering the terms could change how the
// compiler contracts it into FMA and therefore the rounding of the result.
ML_INLINE float dot(const float2& a, const float2& b) {
    return a.x * b.x + a.y * b.y;
}
 207 | 
 208 | ML_INLINE float length(const float2& x) {
 209 |     return sqrt(dot(x, x));
 210 | }
 211 | 
 212 | ML_INLINE float2 normalize(const float2& x) {
 213 |     return x / length(x);
 214 | }
 215 | 
 216 | // non-HLSL
 217 | 
 218 | ML_INLINE float2 Pi(const float2& mul) {
 219 |     return mul * acosf(-1.0f);
 220 | }
 221 | 
 222 | ML_INLINE float2 GetPerpendicularVector(const float2& a) {
 223 |     return float2(-a.y, a.x);
 224 | }
 225 | 
 226 | ML_INLINE float2 Snap(const float2& x, const float2& step) {
 227 |     return round(x / step) * step;
 228 | }
 229 | 
 230 | ML_INLINE float2 Rotate(const float2& v, float angle) {
 231 |     float sa = sin(angle);
 232 |     float ca = cos(angle);
 233 | 
 234 |     float2 p;
 235 |     p.x = ca * v.x + sa * v.y;
 236 |     p.y = ca * v.y - sa * v.x;
 237 | 
 238 |     return p;
 239 | }
 240 | 
 241 | //======================================================================================================================
 242 | // float3
 243 | //======================================================================================================================
 244 | 
// 3-component float vector stored in a 4-lane SIMD register ("xmm"). The same storage is exposed as
// an array ("a"), as named components ("x", "y", "z") and through the swizzles generated by
// ML_SWIZZLE_3. The fourth lane is padding: most constructors pass 0 for it, but the broadcast and
// raw "v4f" constructors leave whatever value was supplied.
union float3 {
    // Raw SIMD register
    v4f xmm;

    // Array access to the components
    struct {
        float a[COORD_3D];
    };

    // Named access to the components
    struct {
        float x, y, z;
    };

    ML_SWIZZLE_3(v4f_swizzle2, float2, v4f_swizzle3, float3);

public:
    // All lanes are zero
    ML_INLINE float3()
        : xmm(_mm_setzero_ps()) {
    }

    // Broadcasts "c" to every lane (including the padding lane)
    ML_INLINE float3(float c)
        : xmm(_mm_set1_ps(c)) {
    }

    // (x, y, z); 0 is passed for the fourth lane
    ML_INLINE float3(float _x, float _y, float _z)
        : xmm(v4f_set(_x, _y, _z, 0.0f)) {
    }

    // (v.x, v.y, z)
    ML_INLINE float3(const float2& v, float _z)
        : xmm(v4f_set(v.x, v.y, _z, 0.0f)) {
    }

    // (x, v.x, v.y)
    ML_INLINE float3(float _x, const float2& v)
        : xmm(v4f_set(_x, v.x, v.y, 0.0f)) {
    }

    // Wraps an existing register as is (the fourth lane is not modified)
    ML_INLINE float3(const v4f& v)
        : xmm(v) {
    }

    // Reads exactly 3 floats from "v3"
    ML_INLINE float3(const float* v3)
        : xmm(v4f_set(v3[0], v3[1], v3[2], 0.0f)) {
    }

    ML_INLINE float3(const float3& v) = default;

    // Set

    // NOTE: returns "void", so assignments can't be chained ("a = b = c")
    ML_INLINE void operator=(const float3& v) {
        xmm = v.xmm;
    }

    // Conversion (declared here, defined elsewhere)

    ML_INLINE operator int3() const;
    ML_INLINE operator uint3() const;
    ML_INLINE operator double3() const;

    // Compare (operators are generated by ML_COMPARE and produce "bool3")

    ML_COMPARE(bool3, float3, <, _mm_cmplt_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool3, float3, <=, _mm_cmple_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool3, float3, ==, _mm_cmpeq_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool3, float3, >, _mm_cmpgt_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool3, float3, >=, _mm_cmpge_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool3, float3, !=, _mm_cmpneq_ps, _mm_movemask_ps, xmm)

    // Ops (binary and compound-assignment operators are generated by ML_OP)

    ML_INLINE float3 operator-() const {
        return v4f_negate(xmm);
    }

    ML_OP(float3, float, -, -=, _mm_sub_ps, _mm_set1_ps, xmm)
    ML_OP(float3, float, +, +=, _mm_add_ps, _mm_set1_ps, xmm)
    ML_OP(float3, float, *, *=, _mm_mul_ps, _mm_set1_ps, xmm)
    ML_OP(float3, float, /, /=, _mm_div_ps, _mm_set1_ps, xmm)

    // Misc

    // Implicit access to the underlying register
    ML_INLINE operator v4f() const {
        return xmm;
    }

    static ML_INLINE float3 Zero() {
        return _mm_setzero_ps();
    }
};
 331 | 
 332 | ML_INLINE float3 degrees(const float3& x) {
 333 |     return x * (180.0f / acosf(-1.0f));
 334 | }
 335 | 
 336 | ML_INLINE float3 radians(const float3& x) {
 337 |     return x * (acosf(-1.0f) / 180.0f);
 338 | }
 339 | 
 340 | ML_INLINE float3 sign(const float3& x) {
 341 |     return v4f_sign(x.xmm);
 342 | }
 343 | 
 344 | ML_INLINE float3 abs(const float3& x) {
 345 |     return v4f_abs(x.xmm);
 346 | }
 347 | 
 348 | ML_INLINE float3 floor(const float3& x) {
 349 |     return v4f_floor(x.xmm);
 350 | }
 351 | 
 352 | ML_INLINE float3 round(const float3& x) {
 353 |     return v4f_round(x.xmm);
 354 | }
 355 | 
 356 | ML_INLINE float3 ceil(const float3& x) {
 357 |     return v4f_ceil(x.xmm);
 358 | }
 359 | 
 360 | ML_INLINE float3 frac(const float3& x) {
 361 |     return v4f_frac(x.xmm);
 362 | }
 363 | 
 364 | ML_INLINE float3 fmod(const float3& x, const float3& y) {
 365 |     return v4f_mod(x.xmm, y.xmm);
 366 | }
 367 | 
 368 | ML_INLINE float3 min(const float3& x, const float3& y) {
 369 |     return _mm_min_ps(x.xmm, y.xmm);
 370 | }
 371 | 
 372 | ML_INLINE float3 max(const float3& x, const float3& y) {
 373 |     return _mm_max_ps(x.xmm, y.xmm);
 374 | }
 375 | 
 376 | ML_INLINE float3 clamp(const float3& x, const float3& a, const float3& b) {
 377 |     return v4f_clamp(x.xmm, a.xmm, b.xmm);
 378 | }
 379 | 
 380 | ML_INLINE float3 saturate(const float3& x) {
 381 |     return v4f_saturate(x.xmm);
 382 | }
 383 | 
 384 | ML_INLINE float3 lerp(const float3& a, const float3& b, const float3& x) {
 385 |     return v4f_mix(a.xmm, b.xmm, x.xmm);
 386 | }
 387 | 
 388 | ML_INLINE float3 linearstep(const float3& a, const float3& b, const float3& x) {
 389 |     return v4f_linearstep(a.xmm, b.xmm, x.xmm);
 390 | }
 391 | 
 392 | ML_INLINE float3 smoothstep(const float3& a, const float3& b, const float3& x) {
 393 |     return v4f_smoothstep(a.xmm, b.xmm, x.xmm);
 394 | }
 395 | 
 396 | ML_INLINE float3 step(const float3& edge, const float3& x) {
 397 |     return v4f_step(edge.xmm, x.xmm);
 398 | }
 399 | 
 400 | ML_INLINE float3 sin(const float3& x) {
 401 |     return _mm_sin_ps(x.xmm);
 402 | }
 403 | 
 404 | ML_INLINE float3 cos(const float3& x) {
 405 |     return _mm_cos_ps(x.xmm);
 406 | }
 407 | 
 408 | ML_INLINE float3 tan(const float3& x) {
 409 |     return _mm_tan_ps(x.xmm);
 410 | }
 411 | 
 412 | ML_INLINE float3 asin(const float3& x) {
 413 |     ML_Assert(all(x >= float3(-1.0f)) && all(x <= float3(1.0f)));
 414 | 
 415 |     return _mm_asin_ps(x.xmm);
 416 | }
 417 | 
 418 | ML_INLINE float3 acos(const float3& x) {
 419 |     ML_Assert(all(x >= float3(-1.0f)) && all(x <= float3(1.0f)));
 420 | 
 421 |     return _mm_acos_ps(x.xmm);
 422 | }
 423 | 
 424 | ML_INLINE float3 atan(const float3& x) {
 425 |     return _mm_atan_ps(x.xmm);
 426 | }
 427 | 
 428 | ML_INLINE float3 atan2(const float3& y, const float3& x) {
 429 |     return _mm_atan2_ps(y.xmm, x.xmm);
 430 | }
 431 | 
 432 | ML_INLINE float3 sqrt(const float3& x) {
 433 |     return _mm_sqrt_ps(x.xmm);
 434 | }
 435 | 
 436 | ML_INLINE float3 rsqrt(const float3& x) {
 437 |     return v4f_rsqrt(x.xmm);
 438 | }
 439 | 
 440 | ML_INLINE float3 rcp(const float3& x) {
 441 |     return v4f_rcp(v4f_setw1(x.xmm));
 442 | }
 443 | 
 444 | ML_INLINE float3 pow(const float3& x, const float3& y) {
 445 |     return _mm_pow_ps(x.xmm, y.xmm);
 446 | }
 447 | 
 448 | ML_INLINE float3 log(const float3& x) {
 449 |     return _mm_log_ps(x.xmm);
 450 | }
 451 | 
 452 | ML_INLINE float3 log2(const float3& x) {
 453 |     return _mm_log2_ps(x.xmm);
 454 | }
 455 | 
 456 | ML_INLINE float3 exp(const float3& x) {
 457 |     return _mm_exp_ps(x.xmm);
 458 | }
 459 | 
 460 | ML_INLINE float3 exp2(const float3& x) {
 461 |     return _mm_exp2_ps(x.xmm);
 462 | }
 463 | 
 464 | ML_INLINE float3 madd(const float3& a, const float3& b, const float3& c) {
 465 |     return _mm_fmadd_ps(a.xmm, b.xmm, c.xmm);
 466 | }
 467 | 
 468 | ML_INLINE float dot(const float3& a, const float3& b) {
 469 |     v4f r = v4f_dot33(a.xmm, b.xmm);
 470 | 
 471 |     return _mm_cvtss_f32(r);
 472 | }
 473 | 
 474 | ML_INLINE float length(const float3& x) {
 475 |     v4f r = v4f_length(x.xmm);
 476 | 
 477 |     return _mm_cvtss_f32(r);
 478 | }
 479 | 
 480 | ML_INLINE float3 normalize(const float3& x) {
 481 |     return v4f_normalize(x.xmm);
 482 | }
 483 | 
 484 | ML_INLINE float3 cross(const float3& x, const float3& y) {
 485 |     return v4f_cross(x.xmm, y.xmm);
 486 | }
 487 | 
 488 | ML_INLINE float3 reflect(const float3& v, const float3& n) {
 489 |     // NOTE: slow
 490 |     // return v - n * dot(n, v) * 2;
 491 | 
 492 |     v4f dot0 = v4f_dot33(n.xmm, v.xmm);
 493 |     dot0 = _mm_mul_ps(dot0, _mm_set1_ps(2.0f));
 494 | 
 495 |     return _mm_fnmadd_ps(n.xmm, dot0, v.xmm);
 496 | }
 497 | 
 498 | ML_INLINE float3 refract(const float3& v, const float3& n, float eta) {
 499 |     // NOTE: slow
 500 |     /*
 501 |     float dot = dot(v, n);
 502 |     float k = 1 - eta * eta * (1 - dot * dot);
 503 | 
 504 |     if( k < 0 )
 505 |         return 0
 506 | 
 507 |     return v * eta - n * (eta * dot + Sqrt(k));
 508 |     */
 509 | 
 510 |     v4f eta0 = _mm_set1_ps(eta);
 511 |     v4f dot0 = v4f_dot33(n.xmm, v.xmm);
 512 |     v4f mul0 = _mm_mul_ps(eta0, eta0);
 513 |     v4f sub0 = _mm_fnmadd_ps(dot0, dot0, c_v4f_1111);
 514 |     v4f sub1 = _mm_fnmadd_ps(mul0, sub0, c_v4f_1111);
 515 | 
 516 |     if (v4f_isnegative4_all(sub1))
 517 |         return _mm_setzero_ps();
 518 | 
 519 |     v4f mul5 = _mm_mul_ps(eta0, v.xmm);
 520 |     v4f mul3 = _mm_mul_ps(eta0, dot0);
 521 |     v4f sqt0 = _mm_sqrt_ps(sub1);
 522 |     v4f add0 = _mm_add_ps(mul3, sqt0);
 523 | 
 524 |     return _mm_fnmadd_ps(add0, n.xmm, mul5);
 525 | }
 526 | 
 527 | // non-HLSL
 528 | 
 529 | ML_INLINE float3 Pi(const float3& mul) {
 530 |     return mul * acosf(-1.0f);
 531 | }
 532 | 
 533 | ML_INLINE float3 GetPerpendicularVector(const float3& N) {
 534 |     float3 T = float3(N.z, -N.x, N.y);
 535 |     T -= N * dot(T, N);
 536 | 
 537 |     return normalize(T);
 538 | }
 539 | 
 540 | ML_INLINE float3 SinCos(const float3& x, float3* pCos) {
 541 |     return _mm_sincos_ps(&pCos->xmm, x.xmm);
 542 | }
 543 | 
 544 | ML_INLINE float3 Snap(const float3& x, const float3& step) {
 545 |     return round(x / step) * step;
 546 | }
 547 | 
 548 | ML_INLINE bool IsPointsNear(const float3& p1, const float3& p2, float eps) {
 549 |     v4f r = _mm_sub_ps(p1.xmm, p2.xmm);
 550 |     r = v4f_abs(r);
 551 |     r = _mm_cmple_ps(r, _mm_set1_ps(eps));
 552 | 
 553 |     return v4f_test3_all(r);
 554 | }
 555 | 
 556 | //======================================================================================================================
 557 | // float4
 558 | //======================================================================================================================
 559 | 
// 4-component float vector stored in a 4-lane SIMD register ("xmm"). The same storage is exposed as
// an array ("a"), as named components ("x", "y", "z", "w") and through the swizzles generated by
// ML_SWIZZLE_4.
union float4 {
    // Raw SIMD register
    v4f xmm;

    // Array access to the components
    struct {
        float a[COORD_4D];
    };

    // Named access to the components
    struct {
        float x, y, z, w;
    };

    ML_SWIZZLE_4(v4f_swizzle2, float2, v4f_swizzle3, float3, v4f_swizzle4, float4);

public:
    // All lanes are zero
    ML_INLINE float4()
        : xmm(_mm_setzero_ps()) {
    }

    // Broadcasts "c" to every lane
    ML_INLINE float4(float c)
        : xmm(_mm_set1_ps(c)) {
    }

    // (x, y, z, w)
    ML_INLINE float4(float _x, float _y, float _z, float _w)
        : xmm(v4f_set(_x, _y, _z, _w)) {
    }

    // (v.x, v.y, v.z, w)
    ML_INLINE float4(const float3& v, float _w)
        : xmm(v4f_set(v.x, v.y, v.z, _w)) {
    }

    // (a.x, a.y, b.x, b.y)
    ML_INLINE float4(const float2& a, const float2& b)
        : xmm(v4f_set(a.x, a.y, b.x, b.y)) {
    }

    // (x, v.x, v.y, v.z)
    ML_INLINE float4(float _x, const float3& v)
        : xmm(v4f_set(_x, v.x, v.y, v.z)) {
    }

    // Wraps an existing register as is
    ML_INLINE float4(const v4f& v)
        : xmm(v) {
    }

    // Reads 4 floats from "v4" using an unaligned load
    ML_INLINE float4(const float* v4)
        : xmm(_mm_loadu_ps(v4)) {
    }

    ML_INLINE float4(const float4& v) = default;

    // Set

    // NOTE: returns "void", so assignments can't be chained ("a = b = c")
    ML_INLINE void operator=(const float4& v) {
        xmm = v.xmm;
    }

    // Conversion (declared here, defined elsewhere)

    ML_INLINE operator int4() const;
    ML_INLINE operator uint4() const;
    ML_INLINE operator double4() const;

    // Compare (operators are generated by ML_COMPARE and produce "bool4")

    ML_COMPARE(bool4, float4, <, _mm_cmplt_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool4, float4, <=, _mm_cmple_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool4, float4, ==, _mm_cmpeq_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool4, float4, >, _mm_cmpgt_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool4, float4, >=, _mm_cmpge_ps, _mm_movemask_ps, xmm)
    ML_COMPARE(bool4, float4, !=, _mm_cmpneq_ps, _mm_movemask_ps, xmm)

    // Ops (binary and compound-assignment operators are generated by ML_OP)

    ML_INLINE float4 operator-() const {
        return v4f_negate(xmm);
    }

    ML_OP(float4, float, -, -=, _mm_sub_ps, _mm_set1_ps, xmm)
    ML_OP(float4, float, +, +=, _mm_add_ps, _mm_set1_ps, xmm)
    ML_OP(float4, float, *, *=, _mm_mul_ps, _mm_set1_ps, xmm)
    ML_OP(float4, float, /, /=, _mm_div_ps, _mm_set1_ps, xmm)

    // Misc

    // Implicit access to the underlying register
    ML_INLINE operator v4f() const {
        return xmm;
    }

    static ML_INLINE float4 Zero() {
        return _mm_setzero_ps();
    }
};
 650 | 
 651 | ML_INLINE float4 degrees(const float4& x) {
 652 |     return x * (180.0f / acosf(-1.0f));
 653 | }
 654 | 
 655 | ML_INLINE float4 radians(const float4& x) {
 656 |     return x * (acosf(-1.0f) / 180.0f);
 657 | }
 658 | 
 659 | ML_INLINE float4 sign(const float4& x) {
 660 |     return v4f_sign(x.xmm);
 661 | }
 662 | 
 663 | ML_INLINE float4 abs(const float4& x) {
 664 |     return v4f_abs(x.xmm);
 665 | }
 666 | 
 667 | ML_INLINE float4 floor(const float4& x) {
 668 |     return v4f_floor(x.xmm);
 669 | }
 670 | 
 671 | ML_INLINE float4 round(const float4& x) {
 672 |     return v4f_round(x.xmm);
 673 | }
 674 | 
 675 | ML_INLINE float4 ceil(const float4& x) {
 676 |     return v4f_ceil(x.xmm);
 677 | }
 678 | 
 679 | ML_INLINE float4 frac(const float4& x) {
 680 |     return v4f_frac(x.xmm);
 681 | }
 682 | 
 683 | ML_INLINE float4 fmod(const float4& x, const float4& y) {
 684 |     return v4f_mod(x.xmm, y.xmm);
 685 | }
 686 | 
 687 | ML_INLINE float4 min(const float4& x, const float4& y) {
 688 |     return _mm_min_ps(x.xmm, y.xmm);
 689 | }
 690 | 
 691 | ML_INLINE float4 max(const float4& x, const float4& y) {
 692 |     return _mm_max_ps(x.xmm, y.xmm);
 693 | }
 694 | 
 695 | ML_INLINE float4 clamp(const float4& x, const float4& a, const float4& b) {
 696 |     return v4f_clamp(x.xmm, a.xmm, b.xmm);
 697 | }
 698 | 
 699 | ML_INLINE float4 saturate(const float4& x) {
 700 |     return v4f_saturate(x.xmm);
 701 | }
 702 | 
 703 | ML_INLINE float4 lerp(const float4& a, const float4& b, const float4& x) {
 704 |     return v4f_mix(a.xmm, b.xmm, x.xmm);
 705 | }
 706 | 
 707 | ML_INLINE float4 linearstep(const float4& a, const float4& b, const float4& x) {
 708 |     return v4f_linearstep(a.xmm, b.xmm, x.xmm);
 709 | }
 710 | 
 711 | ML_INLINE float4 smoothstep(const float4& a, const float4& b, const float4& x) {
 712 |     return v4f_smoothstep(a.xmm, b.xmm, x.xmm);
 713 | }
 714 | 
 715 | ML_INLINE float4 step(const float4& edge, const float4& x) {
 716 |     return v4f_step(edge.xmm, x.xmm);
 717 | }
 718 | 
 719 | ML_INLINE float4 sin(const float4& x) {
 720 |     return _mm_sin_ps(x.xmm);
 721 | }
 722 | 
 723 | ML_INLINE float4 cos(const float4& x) {
 724 |     return _mm_cos_ps(x.xmm);
 725 | }
 726 | 
 727 | ML_INLINE float4 tan(const float4& x) {
 728 |     return _mm_tan_ps(x.xmm);
 729 | }
 730 | 
 731 | ML_INLINE float4 asin(const float4& x) {
 732 |     ML_Assert(all(x >= float4(-1.0f)) && all(x <= float4(1.0f)));
 733 | 
 734 |     return _mm_asin_ps(x.xmm);
 735 | }
 736 | 
 737 | ML_INLINE float4 acos(const float4& x) {
 738 |     ML_Assert(all(x >= float4(-1.0f)) && all(x <= float4(1.0f)));
 739 | 
 740 |     return _mm_acos_ps(x.xmm);
 741 | }
 742 | 
 743 | ML_INLINE float4 atan(const float4& x) {
 744 |     return _mm_atan_ps(x.xmm);
 745 | }
 746 | 
 747 | ML_INLINE float4 atan2(const float4& y, const float4& x) {
 748 |     return _mm_atan2_ps(y.xmm, x.xmm);
 749 | }
 750 | 
 751 | ML_INLINE float4 sqrt(const float4& x) {
 752 |     return _mm_sqrt_ps(x.xmm);
 753 | }
 754 | 
 755 | ML_INLINE float4 rsqrt(const float4& x) {
 756 |     return v4f_rsqrt(x.xmm);
 757 | }
 758 | 
 759 | ML_INLINE float4 rcp(const float4& x) {
 760 |     return v4f_rcp(x.xmm);
 761 | }
 762 | 
 763 | ML_INLINE float4 pow(const float4& x, const float4& y) {
 764 |     return _mm_pow_ps(x.xmm, y.xmm);
 765 | }
 766 | 
 767 | ML_INLINE float4 log(const float4& x) {
 768 |     return _mm_log_ps(x.xmm);
 769 | }
 770 | 
 771 | ML_INLINE float4 log2(const float4& x) {
 772 |     return _mm_log2_ps(x.xmm);
 773 | }
 774 | 
 775 | ML_INLINE float4 exp(const float4& x) {
 776 |     return _mm_exp_ps(x.xmm);
 777 | }
 778 | 
 779 | ML_INLINE float4 exp2(const float4& x) {
 780 |     return _mm_exp2_ps(x.xmm);
 781 | }
 782 | 
 783 | ML_INLINE float4 madd(const float4& a, const float4& b, const float4& c) {
 784 |     return _mm_fmadd_ps(a.xmm, b.xmm, c.xmm);
 785 | }
 786 | 
 787 | ML_INLINE float dot(const float4& a, const float4& b) {
 788 |     v4f r = v4f_dot44(a.xmm, b.xmm);
 789 | 
 790 |     return _mm_cvtss_f32(r);
 791 | }
 792 | 
 793 | // Non-HLSL
 794 | 
 795 | ML_INLINE float4 Pi(const float4& mul) {
 796 |     return mul * acosf(-1.0f);
 797 | }
 798 | 
 799 | ML_INLINE float Dot43(const float4& a, const float3& b) {
 800 |     v4f r = v4f_dot43(a.xmm, b.xmm);
 801 | 
 802 |     return _mm_cvtss_f32(r);
 803 | }
 804 | 
 805 | ML_INLINE float4 Snap(const float4& x, const float4& step) {
 806 |     return round(x / step) * step;
 807 | }
 808 | 
 809 | ML_INLINE float4 SinCos(const float4& x, float4* pCos) {
 810 |     return _mm_sincos_ps(&pCos->xmm, x.xmm);
 811 | }
 812 | 
 813 | // TODO: add "Quaternion"
 814 | ML_INLINE float4 Slerp(const float4& a, const float4& b, float x) {
 815 |     ML_Assert(x >= 0.0f && x <= 1.0f);
 816 |     ML_Assert(abs(dot(a, a) - 1.0f) < 1e-5f);
 817 |     ML_Assert(abs(dot(b, b) - 1.0f) < 1e-5f);
 818 | 
 819 |     float4 r;
 820 | 
 821 |     float theta = dot(a, b);
 822 |     if (theta > 0.9995f)
 823 |         r = lerp(a, b, x);
 824 |     else {
 825 |         theta = acos(theta);
 826 | 
 827 |         float3 s = sin(theta * float3(1.0f, 1.0f - x, x));
 828 |         float sn = 1.0f / s.x;
 829 |         float wa = s.y * sn;
 830 |         float wb = s.z * sn;
 831 | 
 832 |         r = a * wa + b * wb;
 833 |     }
 834 | 
 835 |     r *= rsqrt(dot(r, r));
 836 | 
 837 |     return r;
 838 | }
 839 | 
 840 | //======================================================================================================================
 841 | // float4x4
 842 | //======================================================================================================================
 843 | 
 844 | // IMPORTANT: store - "column-major", usage - "row-major" (vector is a column)
 845 | union float4x4 {
 846 |     // Column array
 847 |     struct {
 848 |         float4 ca[COORD_4D];
 849 | 
 850 |         /*
        TODO: at least older GCC versions don't allow this:
 852 | 
 853 |         float4 c0;
 854 |         float4 c1;
 855 |         float4 c2;
 856 |         float4 c3;
 857 | 
        because of these errors:
 859 |          - member with constructor not allowed in anonymous aggregate
 860 |          - member with copy assignment operator not allowed in anonymous aggregate
 861 |         */
 862 |     };
 863 | 
 864 |     // Element array
 865 |     struct {
 866 |         float a[COORD_4D * COORD_4D];
 867 |     };
 868 | 
 869 |     // Elements aXY, where X - row, Y - column
 870 |     struct {
 871 |         float a00, a10, a20, a30;
 872 |         float a01, a11, a21, a31;
 873 |         float a02, a12, a22, a32;
 874 |         float a03, a13, a23, a33;
 875 |     };
 876 | 
 877 | public:
 878 |     ML_INLINE float4x4() {
 879 |         ca[0] = _mm_setzero_ps();
 880 |         ca[1] = _mm_setzero_ps();
 881 |         ca[2] = _mm_setzero_ps();
 882 |         ca[3] = _mm_setzero_ps();
 883 |     }
 884 | 
 885 |     ML_INLINE float4x4(float m00, float m01, float m02, float m03, float m10, float m11, float m12, float m13, float m20, float m21, float m22, float m23, float m30, float m31,
 886 |         float m32, float m33) {
 887 |         ca[0] = v4f_set(m00, m10, m20, m30);
 888 |         ca[1] = v4f_set(m01, m11, m21, m31);
 889 |         ca[2] = v4f_set(m02, m12, m22, m32);
 890 |         ca[3] = v4f_set(m03, m13, m23, m33);
 891 |     }
 892 | 
 893 |     ML_INLINE float4x4(const float4& c0, const float4& c1, const float4& c2, const float4& c3) {
 894 |         ca[0] = c0;
 895 |         ca[1] = c1;
 896 |         ca[2] = c2;
 897 |         ca[3] = c3;
 898 |     }
 899 | 
    // Copy constructor: compiler-generated
    ML_INLINE float4x4(const float4x4& m) = default;
 901 | 
 902 |     // Set
 903 | 
 904 |     ML_INLINE void operator=(const float4x4& m) {
 905 |         ca[0] = m.ca[0];
 906 |         ca[1] = m.ca[1];
 907 |         ca[2] = m.ca[2];
 908 |         ca[3] = m.ca[3];
 909 |     }
 910 | 
 911 |     // Conversion
 912 | 
    // Conversion to "double4x4" (declared here, the definition is not in this part of the file)
    ML_INLINE operator double4x4() const;
 914 | 
 915 |     // Compare
 916 | 
 917 |     ML_INLINE bool operator==(const float4x4& m) const {
 918 |         return all(ca[0] == m.ca[0]) && all(ca[1] == m.ca[1]) && all(ca[2] == m.ca[2]) && all(ca[3] == m.ca[3]);
 919 |     }
 920 | 
 921 |     ML_INLINE bool operator!=(const float4x4& m) const {
 922 |         return any(ca[0] != m.ca[0]) || any(ca[1] != m.ca[1]) || any(ca[2] != m.ca[2]) || any(ca[3] != m.ca[3]);
 923 |     }
 924 | 
 925 |     // NOTE: *
 926 | 
 927 |     ML_INLINE float4x4 operator*(const float4x4& m) const {
 928 |         float4x4 r;
 929 | 
 930 |         v4f r1 = _mm_mul_ps(v4f_swizzle(m.ca[0], 0, 0, 0, 0), ca[0]);
 931 |         v4f r2 = _mm_mul_ps(v4f_swizzle(m.ca[1], 0, 0, 0, 0), ca[0]);
 932 | 
 933 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 1, 1, 1, 1), ca[1], r1);
 934 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 1, 1, 1, 1), ca[1], r2);
 935 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 2, 2, 2, 2), ca[2], r1);
 936 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 2, 2, 2, 2), ca[2], r2);
 937 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 3, 3, 3, 3), ca[3], r1);
 938 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 3, 3, 3, 3), ca[3], r2);
 939 | 
 940 |         r.ca[0] = r1;
 941 |         r.ca[1] = r2;
 942 | 
 943 |         r1 = _mm_mul_ps(v4f_swizzle(m.ca[2], 0, 0, 0, 0), ca[0]);
 944 |         r2 = _mm_mul_ps(v4f_swizzle(m.ca[3], 0, 0, 0, 0), ca[0]);
 945 | 
 946 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 1, 1, 1, 1), ca[1], r1);
 947 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 1, 1, 1, 1), ca[1], r2);
 948 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 2, 2, 2, 2), ca[2], r1);
 949 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 2, 2, 2, 2), ca[2], r2);
 950 |         r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 3, 3, 3, 3), ca[3], r1);
 951 |         r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 3, 3, 3, 3), ca[3], r2);
 952 | 
 953 |         r.ca[2] = r1;
 954 |         r.ca[3] = r2;
 955 | 
 956 |         return r;
 957 |     }
 958 | 
 959 |     ML_INLINE float4 operator*(const float4& v) const {
 960 |         v4f r = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), ca[0]);
 961 |         r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), ca[1], r);
 962 |         r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), ca[2], r);
 963 |         r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 3, 3, 3, 3), ca[3], r);
 964 | 
 965 |         return r;
 966 |     }
 967 | 
 968 |     ML_INLINE float3 operator*(const float3& v) const {
 969 |         v4f r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), ca[0], ca[3]);
 970 |         r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), ca[1], r);
 971 |         r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), ca[2], r);
 972 | 
 973 |         return r;
 974 |     }
 975 | 
 976 |     // Columns and rows
 977 | 
 978 |     float4& Col(uint32_t column) {
 979 |         ML_Assert(column < COORD_4D);
 980 | 
 981 |         return ca[column];
 982 |     }
 983 | 
 984 |     const float4& Col(uint32_t column) const {
 985 |         ML_Assert(column < COORD_4D);
 986 | 
 987 |         return ca[column];
 988 |     }
 989 | 
 990 |     float4& operator[](uint32_t column) {
 991 |         return Col(column);
 992 |     }
 993 | 
 994 |     const float4& operator[](uint32_t column) const {
 995 |         return Col(column);
 996 |     }
 997 | 
 998 |     ML_INLINE float4 Row(uint32_t row) const {
 999 |         ML_Assert(row < COORD_4D);
1000 | 
1001 |         return float4(a[row], a[COORD_4D + row], a[COORD_4D * 2 + row], a[COORD_4D * 3 + row]);
1002 |     }
1003 | 
1004 |     // NOTE: other
1005 | 
1006 |     static ML_INLINE float4x4 Identity() {
1007 |         return float4x4(1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f);
1008 |     }
1009 | 
1010 |     ML_INLINE float GetNdcDepth(float z) const {
1011 |         float c = a22 * z + a23;
1012 |         float d = a32 * z + a33;
1013 | 
1014 |         return c / d;
1015 |     }
1016 | 
1017 |     ML_INLINE float3 GetRotationYPR() const {
1018 |         float3 r;
1019 |         r.x = atan2(-a01, a11);
1020 |         r.y = asin(a21);
1021 |         r.z = atan2(-a20, a22);
1022 | 
1023 |         return r;
1024 |     }
1025 | 
1026 |     ML_INLINE float4 GetQuaternion() const {
1027 |         float4 q;
1028 |         float t;
1029 | 
1030 |         if (a22 < 0.0f) {
1031 |             if (a00 > a11) {
1032 |                 t = 1.0f + a00 - a11 - a22;
1033 |                 q = float4(t, a10 + a01, a02 + a20, a21 - a12);
1034 |             } else {
1035 |                 t = 1.0f - a00 + a11 - a22;
1036 |                 q = float4(a10 + a01, t, a21 + a12, a02 - a20);
1037 |             }
1038 |         } else {
1039 |             if (a00 < -a11) {
1040 |                 t = 1.0f - a00 - a11 + a22;
1041 |                 q = float4(a02 + a20, a21 + a12, t, a10 - a01);
1042 |             } else {
1043 |                 t = 1.0f + a00 + a11 + a22;
1044 |                 q = float4(a21 - a12, a02 - a20, a10 - a01, t);
1045 |             }
1046 |         }
1047 | 
1048 |         q *= 0.5f / sqrt(t);
1049 | 
1050 |         return q;
1051 |     }
1052 | 
1053 |     ML_INLINE float3 GetScale() const {
1054 |         float3 scale = float3(_mm_cvtss_f32(v4f_length(ca[0])), _mm_cvtss_f32(v4f_length(ca[1])), _mm_cvtss_f32(v4f_length(ca[2])));
1055 | 
1056 |         return scale;
1057 |     }
1058 | 
1059 |     ML_INLINE void SetTranslation(const float3& p) {
1060 |         ca[3] = v4f_setw1(p.xmm);
1061 |     }
1062 | 
1063 |     ML_INLINE void AddTranslation(const float3& p) {
1064 |         ca[3] = _mm_add_ps(ca[3], v4f_setw0(p.xmm));
1065 |     }
1066 | 
    // Declared here, the definition is not in this part of the file
    ML_INLINE void PreTranslation(const float3& p);
1068 | 
1069 |     ML_INLINE void AddScale(const float3& scale) {
1070 |         ca[0] = _mm_mul_ps(ca[0], scale.xmm);
1071 |         ca[1] = _mm_mul_ps(ca[1], scale.xmm);
1072 |         ca[2] = _mm_mul_ps(ca[2], scale.xmm);
1073 |     }
1074 | 
1075 |     ML_INLINE void WorldToView(uint32_t uiProjFlags = 0) {
1076 |         /*
1077 |         float4x4 rot;
1078 |         rot.SetupByRotationX(c_fHalfPi);
1079 |         *this = (*this) * rot;
1080 |         InvertOrtho();
1081 |         */
1082 | 
1083 |         Swap(ca[1], ca[2]);
1084 | 
1085 |         if ((uiProjFlags & PROJ_LEFT_HANDED) == 0)
1086 |             ca[2] = v4f_negate(ca[2]);
1087 | 
1088 |         Transpose3x4();
1089 |     }
1090 | 
1091 |     ML_INLINE void ViewToWorld(uint32_t uiProjFlags = 0) {
1092 |         Transpose3x4();
1093 | 
1094 |         if ((uiProjFlags & PROJ_LEFT_HANDED) == 0)
1095 |             ca[2] = v4f_negate(ca[2]);
1096 | 
1097 |         Swap(ca[1], ca[2]);
1098 |     }
1099 | 
1100 |     ML_INLINE bool IsLeftHanded() const {
1101 |         float3 v1 = cross(float3(ca[0]), float3(ca[1]));
1102 | 
1103 |         return dot(v1, float3(ca[2])) < 0.0f;
1104 |     }
1105 | 
1106 |     ML_INLINE void TransposeTo(float4x4& m) const {
1107 |         v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]);
1108 |         v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]);
1109 |         v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]);
1110 |         v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]);
1111 | 
1112 |         m.ca[0] = v4f_Axy_Bxy(xmm0, xmm1);
1113 |         m.ca[1] = v4f_Azw_Bzw(xmm1, xmm0);
1114 |         m.ca[2] = v4f_Axy_Bxy(xmm2, xmm3);
1115 |         m.ca[3] = v4f_Azw_Bzw(xmm3, xmm2);
1116 |     }
1117 | 
1118 |     ML_INLINE void Transpose() {
1119 |         v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]);
1120 |         v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]);
1121 |         v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]);
1122 |         v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]);
1123 | 
1124 |         ca[0] = v4f_Axy_Bxy(xmm0, xmm1);
1125 |         ca[1] = v4f_Azw_Bzw(xmm1, xmm0);
1126 |         ca[2] = v4f_Axy_Bxy(xmm2, xmm3);
1127 |         ca[3] = v4f_Azw_Bzw(xmm3, xmm2);
1128 |     }
1129 | 
1130 |     ML_INLINE void Transpose3x4() {
1131 |         v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]);
1132 |         v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]);
1133 |         v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]);
1134 |         v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]);
1135 | 
1136 |         ca[0] = v4f_Axy_Bxy(xmm0, xmm1);
1137 |         ca[1] = v4f_Azw_Bzw(xmm1, xmm0);
1138 |         ca[2] = v4f_Axy_Bxy(xmm2, xmm3);
1139 |     }
1140 | 
1141 |     ML_INLINE void Invert() {  // In-place general 4x4 inverse (all four columns are overwritten)
1142 |         // NOTE: http://forum.devmaster.net/t/sse-mat4-inverse/16799
1143 |         // Fac0..Fac5: each lane is "product of two elements - product of two elements" taken from ca[1], ca[2], ca[3]
1144 |         v4f Fac0;
1145 |         {
1146 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3);
1147 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2);
1148 | 
1149 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2);
1150 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1151 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1152 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3);
1153 | 
1154 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1155 | 
1156 |             Fac0 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);  // Mul00 - Swp02 * Swp03
1157 |         }
1158 | 
1159 |         v4f Fac1;
1160 |         {
1161 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3);
1162 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1);
1163 | 
1164 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1);
1165 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1166 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1167 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3);
1168 | 
1169 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1170 | 
1171 |             Fac1 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);
1172 |         }
1173 | 
1174 |         v4f Fac2;
1175 |         {
1176 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2);
1177 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1);
1178 | 
1179 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1);
1180 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1181 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1182 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2);
1183 | 
1184 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1185 | 
1186 |             Fac2 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);
1187 |         }
1188 | 
1189 |         v4f Fac3;
1190 |         {
1191 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3);
1192 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0);
1193 | 
1194 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0);
1195 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1196 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1197 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3);
1198 | 
1199 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1200 | 
1201 |             Fac3 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);
1202 |         }
1203 | 
1204 |         v4f Fac4;
1205 |         {
1206 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2);
1207 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0);
1208 | 
1209 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0);
1210 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1211 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1212 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2);
1213 | 
1214 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1215 | 
1216 |             Fac4 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);
1217 |         }
1218 | 
1219 |         v4f Fac5;
1220 |         {
1221 |             v4f Swp0a = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1);
1222 |             v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0);
1223 | 
1224 |             v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0);
1225 |             v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2);
1226 |             v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2);
1227 |             v4f Swp03 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1);
1228 | 
1229 |             v4f Mul00 = _mm_mul_ps(Swp00, Swp01);
1230 | 
1231 |             Fac5 = _mm_fnmadd_ps(Swp02, Swp03, Mul00);
1232 |         }
1233 |         // Alternating +/- patterns that are multiplied into Add0..Add3 below
1234 |         v4f SignA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
1235 |         v4f SignB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f);
1236 |         // Vec0..Vec3: elements of ca[1] and ca[0] gathered by component index 0..3
1237 |         v4f Temp0 = v4f_shuffle(ca[1], ca[0], 0, 0, 0, 0);
1238 |         v4f Vec0 = v4f_swizzle(Temp0, 0, 2, 2, 2);
1239 | 
1240 |         v4f Temp1 = v4f_shuffle(ca[1], ca[0], 1, 1, 1, 1);
1241 |         v4f Vec1 = v4f_swizzle(Temp1, 0, 2, 2, 2);
1242 | 
1243 |         v4f Temp2 = v4f_shuffle(ca[1], ca[0], 2, 2, 2, 2);
1244 |         v4f Vec2 = v4f_swizzle(Temp2, 0, 2, 2, 2);
1245 | 
1246 |         v4f Temp3 = v4f_shuffle(ca[1], ca[0], 3, 3, 3, 3);
1247 |         v4f Vec3 = v4f_swizzle(Temp3, 0, 2, 2, 2);
1248 |         // Each AddN below is a three-term sum "VecA * FacI - VecB * FacJ + VecC * FacK"
1249 |         v4f Mul0 = _mm_mul_ps(Vec1, Fac0);
1250 |         v4f Mul1 = _mm_mul_ps(Vec0, Fac0);
1251 |         v4f Mul2 = _mm_mul_ps(Vec0, Fac1);
1252 |         v4f Mul3 = _mm_mul_ps(Vec0, Fac2);
1253 | 
1254 |         v4f Sub0 = _mm_fnmadd_ps(Vec2, Fac1, Mul0);
1255 |         v4f Sub1 = _mm_fnmadd_ps(Vec2, Fac3, Mul1);
1256 |         v4f Sub2 = _mm_fnmadd_ps(Vec1, Fac3, Mul2);
1257 |         v4f Sub3 = _mm_fnmadd_ps(Vec1, Fac4, Mul3);
1258 | 
1259 |         v4f Add0 = _mm_fmadd_ps(Vec3, Fac2, Sub0);
1260 |         v4f Add1 = _mm_fmadd_ps(Vec3, Fac4, Sub1);
1261 |         v4f Add2 = _mm_fmadd_ps(Vec3, Fac5, Sub2);
1262 |         v4f Add3 = _mm_fmadd_ps(Vec2, Fac5, Sub3);
1263 |         // Inv0..Inv3: the unscaled result columns
1264 |         v4f Inv0 = _mm_mul_ps(SignB, Add0);
1265 |         v4f Inv1 = _mm_mul_ps(SignA, Add1);
1266 |         v4f Inv2 = _mm_mul_ps(SignB, Add2);
1267 |         v4f Inv3 = _mm_mul_ps(SignA, Add3);
1268 |         // Row2 presumably gathers component 0 of Inv0..Inv3 (depends on v4f_shuffle semantics - TODO confirm)
1269 |         v4f Row0 = v4f_shuffle(Inv0, Inv1, 0, 0, 0, 0);
1270 |         v4f Row1 = v4f_shuffle(Inv2, Inv3, 0, 0, 0, 0);
1271 |         v4f Row2 = v4f_shuffle(Row0, Row1, 0, 2, 0, 2);
1272 |         // NOTE(review): Det0 is not checked against zero before v4f_rcp; the precision of v4f_rcp is not visible here
1273 |         v4f Det0 = v4f_dot44(ca[0], Row2);
1274 |         v4f Rcp0 = v4f_rcp(Det0);
1275 |         // Scale every column by the reciprocal of Det0
1276 |         ca[0] = _mm_mul_ps(Inv0, Rcp0);
1277 |         ca[1] = _mm_mul_ps(Inv1, Rcp0);
1278 |         ca[2] = _mm_mul_ps(Inv2, Rcp0);
1279 |         ca[3] = _mm_mul_ps(Inv3, Rcp0);
1280 |     }
1281 | 
1282 |     ML_INLINE void InvertOrtho();  // Declared only; the body is defined out of class because it calls the free function Rotate()
1283 | 
1284 |     // NOTE: special sets
1285 | 
1286 |     ML_INLINE void SetupByQuaternion(const float4& q) {
1287 |         // NOTE: "q" must already be normalized, it is used as is
1288 |         float twoX = q.x + q.x;
1289 |         float twoY = q.y + q.y;
1290 |         float twoZ = q.z + q.z;
1291 |         float wx = q.w * twoX;
1292 |         float wy = q.w * twoY;
1293 |         float wz = q.w * twoZ;
1294 |         float xx = q.x * twoX;
1295 |         float xy = q.x * twoY;
1296 |         float xz = q.x * twoZ;
1297 |         float yy = q.y * twoY;
1298 |         float yz = q.y * twoZ;
1299 |         float zz = q.z * twoZ;
1300 |         // All products above already carry the factor 2; the three rotation columns have w = 0
1301 |         ca[0] = float4(1.0f - (yy + zz), xy + wz, xz - wy, 0.0f).xmm;
1302 |         ca[1] = float4(xy - wz, 1.0f - (xx + zz), yz + wx, 0.0f).xmm;
1303 |         ca[2] = float4(xz + wy, yz - wx, 1.0f - (xx + yy), 0.0f).xmm;
1304 |         ca[3] = c_v4f_0001;
1305 |     }
1306 | 
1307 |     ML_INLINE void SetupByRotationX(float angleX) {
1308 |         float sinA = sin(angleX);
1309 |         float cosA = cos(angleX);
1310 |         // Column 0 (X axis) stays fixed, columns 1 and 2 carry the rotation
1311 |         ca[0] = float4(1.0f, 0.0f, 0.0f, 0.0f);
1312 |         ca[1] = float4(0.0f, cosA, sinA, 0.0f);
1313 |         ca[2] = float4(0.0f, -sinA, cosA, 0.0f);
1314 |         ca[3] = c_v4f_0001;
1315 |     }
1316 | 
1317 |     ML_INLINE void SetupByRotationY(float angleY) {
1318 |         float sinA = sin(angleY);
1319 |         float cosA = cos(angleY);
1320 |         // Column 1 (Y axis) stays fixed, columns 0 and 2 carry the rotation
1321 |         ca[0] = float4(cosA, 0.0f, -sinA, 0.0f);
1322 |         ca[1] = float4(0.0f, 1.0f, 0.0f, 0.0f);
1323 |         ca[2] = float4(sinA, 0.0f, cosA, 0.0f);
1324 |         ca[3] = c_v4f_0001;
1325 |     }
1326 | 
1327 |     ML_INLINE void SetupByRotationZ(float angleZ) {
1328 |         float sinA = sin(angleZ);
1329 |         float cosA = cos(angleZ);
1330 |         // Column 2 (Z axis) stays fixed, columns 0 and 1 carry the rotation
1331 |         ca[0] = float4(cosA, sinA, 0.0f, 0.0f);
1332 |         ca[1] = float4(-sinA, cosA, 0.0f, 0.0f);
1333 |         ca[2] = float4(0.0f, 0.0f, 1.0f, 0.0f);
1334 |         ca[3] = c_v4f_0001;
1335 |     }
1336 | 
1337 |     ML_INLINE void SetupByRotationYPR(float fYaw, float fPitch, float fRoll) {
1338 |         // NOTE: "yaw-pitch-roll" rotation
1339 |         //       yaw - around Z (object "down-up" axis)
1340 |         //       pitch - around X (object "left-right" axis)
1341 |         //       roll - around Y (object "backward-forward" axis)
1342 |         // Reference composition (the closed form below is the expanded product Rz(yaw) * Rx(pitch) * Ry(roll)):
1343 |         /*
1344 |         float4x4 rot;
1345 |         rot.SetupByRotationY(fRoll);
1346 |         *this = rot;
1347 |         rot.SetupByRotationX(fPitch);
1348 |         *this = rot * (*this);
1349 |         rot.SetupByRotationZ(fYaw);
1350 |         *this = rot * (*this);
1351 |         */
1352 |         // Lanes: x = yaw, y = pitch, z = roll (w is a dummy 0)
1353 |         float4 ypr(fYaw, fPitch, fRoll, 0.0f);
1354 | 
1355 |         float4 cosv;
1356 |         float4 sinv = _mm_sincos_ps(&cosv.xmm, ypr.xmm);
1357 |         // Column 0
1358 |         a00 = cosv.x * cosv.z - sinv.x * sinv.y * sinv.z;
1359 |         a10 = sinv.x * cosv.z + cosv.x * sinv.y * sinv.z;
1360 |         a20 = -cosv.y * sinv.z;
1361 |         a30 = 0.0f;
1362 |         // Column 1
1363 |         a01 = -sinv.x * cosv.y;
1364 |         a11 = cosv.x * cosv.y;
1365 |         a21 = sinv.y;
1366 |         a31 = 0.0f;
1367 |         // Column 2
1368 |         a02 = cosv.x * sinv.z + cosv.z * sinv.x * sinv.y;
1369 |         a12 = sinv.x * sinv.z - cosv.x * sinv.y * cosv.z;
1370 |         a22 = cosv.y * cosv.z;
1371 |         a32 = 0.0f;
1372 |         // Column 3: no translation
1373 |         ca[3] = c_v4f_0001;
1374 |     }
1375 | 
1376 |     ML_INLINE void SetupByRotation(float theta, const float3& v) {
1377 |         float cosTheta = cos(theta);
1378 |         float sinTheta = sin(theta);
1379 |         // Forward to the (sin, cos, axis) overload
1380 |         SetupByRotation(sinTheta, cosTheta, v);
1381 |     }
1382 | 
1383 |     ML_INLINE void SetupByRotation(float st, float ct, const float3& v) {
1384 |         float xSq = v.x * v.x;
1385 |         float ySq = v.y * v.y;
1386 |         float zSq = v.z * v.z;
1387 |         float xTimesY = v.x * v.y;
1388 |         float xTimesZ = v.x * v.z;
1389 |         float yTimesZ = v.y * v.z;
1390 |         float cosXY = ct * xTimesY;
1391 |         float cosXZ = ct * xTimesZ;
1392 |         float cosYZ = ct * yTimesZ;
1393 |         float sinY = st * v.y;
1394 |         float sinX = st * v.x;
1395 |         float sinZ = st * v.z;
1396 |         // Axis-angle rotation from sin / cos of the angle; "v" is used as given (assumes a unit axis - TODO confirm)
1397 |         a00 = xSq + ct * (1.0f - xSq);
1398 |         a01 = xTimesY - cosXY - sinZ;
1399 |         a02 = xTimesZ - cosXZ + sinY;
1400 | 
1401 |         a10 = xTimesY - cosXY + sinZ;
1402 |         a11 = ySq + ct * (1.0f - ySq);
1403 |         a12 = yTimesZ - cosYZ - sinX;
1404 | 
1405 |         a20 = xTimesZ - cosXZ - sinY;
1406 |         a21 = yTimesZ - cosYZ + sinX;
1407 |         a22 = zSq + ct * (1.0f - zSq);
1408 |         // Bottom row of the rotation part, then the last column (no translation)
1409 |         a30 = 0.0f;
1410 |         a31 = 0.0f;
1411 |         a32 = 0.0f;
1412 | 
1413 |         ca[3] = c_v4f_0001;
1414 |     }
1415 | 
1416 |     ML_INLINE void SetupByRotation(const float3& z, const float3& d) {
1417 |         /*
1418 |         // NOTE: rotation built from the direction pair (z, d), same as
1419 |         //       (inputs are used as given - presumably unit length, TODO confirm with callers)
1420 |         float3 axis = cross(z, d);
1421 |         float angle = Acos( dot(z, d) );
1422 | 
1423 |         SetupByRotation(angle, axis);
1424 |         */
1425 |         // R = c * I + [w]x + k * (w * w^T), with w = cross(z, d), c = dot(z, d)
1426 |         float3 w = cross(z, d);
1427 |         float c = dot(z, d);
1428 |         float k = 1.0f / (1.0f + c);  // == (1 - c) / (1 - c * c), but finite for c == 1 (z == d gives identity instead of NaN)
1429 |         // NOTE: c == -1 (opposite directions) is still singular - the rotation axis is undefined there
1430 |         float hxy = w.x * w.y * k;
1431 |         float hxz = w.x * w.z * k;
1432 |         float hyz = w.y * w.z * k;
1433 | 
1434 |         a00 = c + w.x * w.x * k;
1435 |         a01 = hxy - w.z;
1436 |         a02 = hxz + w.y;
1437 | 
1438 |         a10 = hxy + w.z;
1439 |         a11 = c + w.y * w.y * k;
1440 |         a12 = hyz - w.x;
1441 | 
1442 |         a20 = hxz - w.y;
1443 |         a21 = hyz + w.x;
1444 |         a22 = c + w.z * w.z * k;
1445 |         // Bottom row of the rotation part, then the last column (no translation)
1446 |         a30 = 0.0f;
1447 |         a31 = 0.0f;
1448 |         a32 = 0.0f;
1449 | 
1450 |         ca[3] = c_v4f_0001;
1451 |     }
1452 | 
1453 |     ML_INLINE void SetupByTranslation(const float3& p) {  // Identity rotation part, "p" in the last column
1454 |         ca[2] = float4(0.0f, 0.0f, 1.0f, 0.0f);
1455 |         ca[1] = float4(0.0f, 1.0f, 0.0f, 0.0f);
1456 |         ca[0] = float4(1.0f, 0.0f, 0.0f, 0.0f);
1457 |         ca[3] = v4f_setw1(p);  // "p" is read last, after the three constant columns are stored
1458 |     }
1459 | 
1460 |     ML_INLINE void SetupByScale(const float3& scale) {  // Diagonal matrix: scale.x / scale.y / scale.z on the diagonal
1461 |         ca[0] = float4(scale.x, 0.0f, 0.0f, 0.0f);
1462 |         ca[1] = float4(0.0f, scale.y, 0.0f, 0.0f);
1463 |         ca[2] = float4(0.0f, 0.0f, scale.z, 0.0f);
1464 |         ca[3] = c_v4f_0001;  // last column: no translation
1465 |     }
1466 | 
1467 |     ML_INLINE void SetupByLookAt(const float3& vForward) {
1468 |         float3 axisY = normalize(vForward);
1469 |         float3 axisZ = GetPerpendicularVector(axisY);
1470 |         float3 axisX = cross(axisY, axisZ);
1471 |         // Forward goes into column 1; the remaining two axes are derived from it
1472 |         ca[0] = v4f_setw0(axisX);
1473 |         ca[1] = v4f_setw0(axisY);
1474 |         ca[2] = v4f_setw0(axisZ);
1475 |         ca[3] = c_v4f_0001;
1476 |     }
1477 | 
1478 |     ML_INLINE void SetupByLookAt(const float3& vForward, const float3& vRight) {
1479 |         float3 axisY = normalize(vForward);
1480 |         float3 axisZ = normalize(cross(vRight, axisY));
1481 |         float3 axisX = cross(axisY, axisZ);
1482 |         // Forward goes into column 1; vRight only steers the basis, column 0 is recomputed from the other two axes
1483 |         ca[0] = v4f_setw0(axisX);
1484 |         ca[1] = v4f_setw0(axisY);
1485 |         ca[2] = v4f_setw0(axisZ);
1486 |         ca[3] = c_v4f_0001;
1487 |     }
1488 | 
1489 |     ML_INLINE void SetupByOrthoProjection(float left, float right, float bottom, float top, float zNear, float zFar, uint32_t uiProjFlags = 0) {
1490 |         ML_Assert(left < right);
1491 |         ML_Assert(bottom < top);
1492 |         // Reciprocal extents of the view volume
1493 |         float invWidth = 1.0f / (right - left);
1494 |         float invHeight = 1.0f / (top - bottom);
1495 |         float invDepth = 1.0f / (zFar - zNear);
1496 |         // Column 0
1497 |         a00 = 2.0f * invWidth;
1498 |         a10 = 0.0f;
1499 |         a20 = 0.0f;
1500 |         a30 = 0.0f;
1501 |         // Column 1
1502 |         a01 = 0.0f;
1503 |         a11 = 2.0f * invHeight;
1504 |         a21 = 0.0f;
1505 |         a31 = 0.0f;
1506 |         // Column 2
1507 |         a02 = 0.0f;
1508 |         a12 = 0.0f;
1509 |         a22 = -2.0f * invDepth;
1510 |         a32 = 0.0f;
1511 |         // Column 3
1512 |         a03 = -(right + left) * invWidth;
1513 |         a13 = -(top + bottom) * invHeight;
1514 |         a23 = -(zFar + zNear) * invDepth;
1515 |         a33 = 1.0f;
1516 |         // Optional depth remap (PROJ_REVERSED_Z), applied to the two depth terms
1517 |         bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0;
1518 | 
1519 |         a22 = ML_ModifyProjZ(bReverseZ, a22, a32);
1520 |         a23 = ML_ModifyProjZ(bReverseZ, a23, a33);
1521 |         // Left-handed variant: negate column 2
1522 |         if (uiProjFlags & PROJ_LEFT_HANDED)
1523 |             ca[2] = v4f_negate(ca[2]);
1524 |     }
1525 | 
1526 |     ML_INLINE void SetupByFrustum(float left, float right, float bottom, float top, float zNear, float zFar, uint32_t uiProjFlags = 0) {
1527 |         ML_Assert(left < right);
1528 |         ML_Assert(bottom < top);
1529 |         // Reciprocal extents; note the depth term is 1 / (zNear - zFar)
1530 |         float invWidth = 1.0f / (right - left);
1531 |         float invHeight = 1.0f / (top - bottom);
1532 |         float invDepth = 1.0f / (zNear - zFar);
1533 |         // Column 0
1534 |         a00 = 2.0f * zNear * invWidth;
1535 |         a10 = 0.0f;
1536 |         a20 = 0.0f;
1537 |         a30 = 0.0f;
1538 |         // Column 1
1539 |         a01 = 0.0f;
1540 |         a11 = 2.0f * zNear * invHeight;
1541 |         a21 = 0.0f;
1542 |         a31 = 0.0f;
1543 |         // Column 2 (a02 / a12 are the off-center terms)
1544 |         a02 = (right + left) * invWidth;
1545 |         a12 = (top + bottom) * invHeight;
1546 |         a22 = (zFar + zNear) * invDepth;
1547 |         a32 = -1.0f;
1548 |         // Column 3
1549 |         a03 = 0.0f;
1550 |         a13 = 0.0f;
1551 |         a23 = 2.0f * zFar * zNear * invDepth;
1552 |         a33 = 0.0f;
1553 |         // Optional depth remap (PROJ_REVERSED_Z), applied to the two depth terms
1554 |         bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0;
1555 | 
1556 |         a22 = ML_ModifyProjZ(bReverseZ, a22, a32);
1557 |         a23 = ML_ModifyProjZ(bReverseZ, a23, a33);
1558 |         // Left-handed variant: negate column 2
1559 |         if (uiProjFlags & PROJ_LEFT_HANDED)
1560 |             ca[2] = v4f_negate(ca[2]);
1561 |     }
1562 | 
1563 |     ML_INLINE void SetupByFrustumInf(float left, float right, float bottom, float top, float zNear, uint32_t uiProjFlags = 0) {
1564 |         ML_Assert(left < right);
1565 |         ML_Assert(bottom < top);
1566 |         // Reciprocal extents (no far plane, hence no depth extent)
1567 |         float invWidth = 1.0f / (right - left);
1568 |         float invHeight = 1.0f / (top - bottom);
1569 |         // Column 0
1570 |         a00 = 2.0f * zNear * invWidth;
1571 |         a10 = 0.0f;
1572 |         a20 = 0.0f;
1573 |         a30 = 0.0f;
1574 |         // Column 1
1575 |         a01 = 0.0f;
1576 |         a11 = 2.0f * zNear * invHeight;
1577 |         a21 = 0.0f;
1578 |         a31 = 0.0f;
1579 |         // Column 2 (a02 / a12 are the off-center terms)
1580 |         a02 = (right + left) * invWidth;
1581 |         a12 = (top + bottom) * invHeight;
1582 |         a22 = -1.0f;
1583 |         a32 = -1.0f;
1584 |         // Column 3
1585 |         a03 = 0.0f;
1586 |         a13 = 0.0f;
1587 |         a23 = -2.0f * zNear;
1588 |         a33 = 0.0f;
1589 |         // Optional depth remap (PROJ_REVERSED_Z), applied to the two depth terms
1590 |         bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0;
1591 | 
1592 |         a22 = ML_ModifyProjZ(bReverseZ, a22, a32);
1593 |         a23 = ML_ModifyProjZ(bReverseZ, a23, a33);
1594 |         // Left-handed variant: negate column 2
1595 |         if (uiProjFlags & PROJ_LEFT_HANDED)
1596 |             ca[2] = v4f_negate(ca[2]);
1597 |     }
1598 | 
1599 |     ML_INLINE void SetupByHalfFovy(float halfFovy, float aspect, float zNear, float zFar, uint32_t uiProjFlags = 0) {
1600 |         float halfHeight = zNear * tan(halfFovy);
1601 |         float halfWidth = halfHeight * aspect;
1602 |         // Symmetric frustum measured on the near plane; aspect = width / height
1603 |         SetupByFrustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, zFar, uiProjFlags);
1604 |     }
1605 | 
1606 |     ML_INLINE void SetupByHalfFovyInf(float halfFovy, float aspect, float zNear, uint32_t uiProjFlags = 0) {
1607 |         float halfHeight = zNear * tan(halfFovy);
1608 |         float halfWidth = halfHeight * aspect;
1609 |         // Symmetric frustum without a far plane; aspect = width / height
1610 |         SetupByFrustumInf(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, uiProjFlags);
1611 |     }
1612 | 
1613 |     ML_INLINE void SetupByHalfFovx(float halfFovx, float aspect, float zNear, float zFar, uint32_t uiProjFlags = 0) {
1614 |         float halfWidth = zNear * tan(halfFovx);
1615 |         float halfHeight = halfWidth / aspect;
1616 |         // Symmetric frustum driven by the horizontal half angle; aspect = width / height
1617 |         SetupByFrustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, zFar, uiProjFlags);
1618 |     }
1619 | 
1620 |     ML_INLINE void SetupByHalfFovxInf(float halfFovx, float aspect, float zNear, uint32_t uiProjFlags = 0) {
1621 |         float halfWidth = zNear * tan(halfFovx);
1622 |         float halfHeight = halfWidth / aspect;
1623 |         // Symmetric frustum without a far plane, driven by the horizontal half angle
1624 |         SetupByFrustumInf(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, uiProjFlags);
1625 |     }
1626 | 
1627 |     ML_INLINE void SetupByAngles(float angleMinx, float angleMaxx, float angleMiny, float angleMaxy, float zNear, float zFar, uint32_t uiProjFlags = 0) {
1628 |         float nearLeft = tan(angleMinx) * zNear;
1629 |         float nearRight = tan(angleMaxx) * zNear;
1630 |         float nearBottom = tan(angleMiny) * zNear;
1631 |         float nearTop = tan(angleMaxy) * zNear;
1632 |         // The four edge angles are converted to near-plane extents, then handed to the generic frustum setup
1633 |         SetupByFrustum(nearLeft, nearRight, nearBottom, nearTop, zNear, zFar, uiProjFlags);
1634 |     }
1635 | 
1636 |     ML_INLINE void SetupByAnglesInf(float angleMinx, float angleMaxx, float angleMiny, float angleMaxy, float zNear, uint32_t uiProjFlags = 0) {
1637 |         float nearLeft = tan(angleMinx) * zNear;
1638 |         float nearRight = tan(angleMaxx) * zNear;
1639 |         float nearBottom = tan(angleMiny) * zNear;
1640 |         float nearTop = tan(angleMaxy) * zNear;
1641 |         // The four edge angles are converted to near-plane extents, then handed to the infinite frustum setup
1642 |         SetupByFrustumInf(nearLeft, nearRight, nearBottom, nearTop, zNear, uiProjFlags);
1643 |     }
1644 | 
1645 |     ML_INLINE void SubsampleProjection(float dx, float dy, uint32_t viewportWidth, uint32_t viewportHeight) {
1646 |         // NOTE: dx/dy in range [-1; 1]
1647 |         // The jitter is added to the off-center terms of the projection (a02 / a12)
1648 |         a02 = a02 + dx / float(viewportWidth);
1649 |         a12 = a12 + dy / float(viewportHeight);
1650 |     }
1651 | 
1652 |     ML_INLINE bool IsProjectionValid() const {
1653 |         // a02 / a12 (off-centered projections) and a22 (reverse infinite projections) are deliberately not checked
1654 |         bool xyColumnsOk = (a00 != 0.0f && a10 == 0.0f && a20 == 0.0f && a30 == 0.0f) && (a01 == 0.0f && a11 != 0.0f && a21 == 0.0f && a31 == 0.0f);
1655 |         bool zwColumnsOk = (a32 == 1.0f || a32 == -1.0f) && (a03 == 0.0f && a13 == 0.0f && a23 != 0.0f && a33 == 0.0f);
1656 |         return xyColumnsOk && zwColumnsOk;
1657 |     }
1658 | };
1659 | 
1660 | ML_INLINE float4 mul(const float4x4& m, const float4& v) {  // Function-call spelling of "m * v"
1661 |     return m * v;
1662 | }
1663 | 
1664 | ML_INLINE float4x4 transpose(const float4x4& m) {
1665 |     float4x4 transposed;
1666 |     m.TransposeTo(transposed);
1667 |     // "m" itself is left untouched, the result is returned by value
1668 |     return transposed;
1669 | }
1670 | 
1671 | // non-HLSL
1672 | 
1673 | ML_INLINE float3 Rotate(const float4x4& m, const float3& v) {
1674 |     v4f sum = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), m.ca[0]);
1675 |     sum = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), m.ca[1], sum);
1676 |     sum = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), m.ca[2], sum);
1677 |     sum = v4f_setw0(sum);
1678 |     // Only columns 0..2 take part: the translation column ca[3] is ignored
1679 |     return sum;
1680 | }
1681 | 
1682 | ML_INLINE float3 RotateAbs(const float4x4& m, const float3& v) {
1683 |     v4f absCol0 = v4f_abs(m.ca[0]);
1684 |     v4f absCol1 = v4f_abs(m.ca[1]);
1685 |     v4f absCol2 = v4f_abs(m.ca[2]);
1686 |     // Same accumulation as Rotate(), but with v4f_abs applied to each column and without the final w reset
1687 |     v4f sum = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), absCol0);
1688 |     sum = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), absCol1, sum);
1689 |     sum = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), absCol2, sum);
1690 | 
1691 |     return sum;
1692 | }
1693 | 
1694 | ML_INLINE float3 Project(const float3& v, const float4x4& m) {
1695 |     float4 homogeneous = (m * v).xmm;
1696 |     homogeneous /= homogeneous.w;
1697 |     // Perspective divide done above; w == 0 is not guarded against
1698 |     return homogeneous.xyz;
1699 | }
1700 | 
1701 | ML_INLINE void float4x4::PreTranslation(const float3& p) {  // Adds "p", taken through columns 0..2, to the last column
1702 |     v4f rotatedOffset = Rotate(*this, p.xmm).xmm;
1703 |     ca[3] = _mm_add_ps(ca[3], rotatedOffset);
1704 | }
1705 | 
1706 | ML_INLINE void float4x4::InvertOrtho() {
1707 |     Transpose3x4();
1708 |     // New last column: minus the old one taken through the already transposed columns 0..2
1709 |     v4f invTranslation = Rotate(*this, float3(ca[3])).xmm;
1710 |     invTranslation = v4f_negate(invTranslation);
1711 |     // Transpose3x4() left data in the w lanes of columns 0..2: reset them to 0, and w of the last column to 1
1712 |     ca[0] = v4f_setw0(ca[0]);
1713 |     ca[1] = v4f_setw0(ca[1]);
1714 |     ca[2] = v4f_setw0(ca[2]);
1715 |     ca[3] = v4f_setw1(invTranslation);
1716 | }
1717 | 
1718 | //======================================================================================================================
1719 | // cBoxf
1720 | //======================================================================================================================
1721 | 
1722 | struct cBoxf {  // Axis-aligned box described by its two extreme corners
1723 |     float3 vMin;
1724 |     float3 vMax;
1725 | 
1726 | public:
1727 |     ML_INLINE cBoxf() {  // Starts in the state produced by Clear()
1728 |         Clear();
1729 |     }
1730 |     // Box collapsed onto the single point "v"
1731 |     ML_INLINE cBoxf(const float3& v) {
1732 |         vMax = v;
1733 |         vMin = v;
1734 |     }
1735 |     // Box from explicit corners; the arguments are stored as given, without ordering checks
1736 |     ML_INLINE cBoxf(const float3& minv, const float3& maxv) {
1737 |         vMax = maxv;
1738 |         vMin = minv;
1739 |     }
1740 |     // Resets the corners to c_v4f_Inf / c_v4f_InfMinus (presumably +Inf / -Inf, i.e. an inverted "nothing added yet" box)
1741 |     ML_INLINE void Clear() {
1742 |         vMax = float3(c_v4f_InfMinus);
1743 |         vMin = float3(c_v4f_Inf);
1744 |     }
1745 |     // Strict "vMin < vMax" test, so a box built from a single point is reported as not valid
1746 |     ML_INLINE bool IsValid() const {
1747 |         v4f isLess = _mm_cmplt_ps(vMin.xmm, vMax.xmm);
1748 | 
1749 |         return v4f_test3_all(isLess);
1750 |     }
1751 |     // Midpoint of the two corners
1752 |     ML_INLINE float3 GetCenter() const {
1753 |         return (vMin + vMax) * 0.5f;
1754 |     }
1755 |     // Half of the diagonal length
1756 |     ML_INLINE float GetRadius() const {
1757 |         return length(vMax - vMin) * 0.5f;
1758 |     }
1759 |     // Scales the box about its center: fScale = 1 keeps it, fScale = 0 collapses it onto the center
1760 |     ML_INLINE void Scale(float fScale) {
1761 |         float halfScale = fScale * 0.5f;
1762 | 
1763 |         float wSame = 0.5f + halfScale;
1764 |         float wOther = 0.5f - halfScale;
1765 | 
1766 |         float3 newMin = vMin * wSame + vMax * wOther;
1767 |         float3 newMax = vMax * wSame + vMin * wOther;
1768 | 
1769 |         vMin = newMin;
1770 |         vMax = newMax;
1771 |     }
1772 |     // Grows the box by vBorder on both sides of every axis
1773 |     ML_INLINE void Enlarge(const float3& vBorder) {
1774 |         vMin -= vBorder;
1775 |         vMax += vBorder;
1776 |     }
1777 |     // Extends the box so that it includes the point "v"
1778 |     ML_INLINE void Add(const float3& v) {
1779 |         vMin = _mm_min_ps(vMin.xmm, v.xmm);
1780 |         vMax = _mm_max_ps(vMax.xmm, v.xmm);
1781 |     }
1782 |     // Extends the box so that it includes the box "b"
1783 |     ML_INLINE void Add(const cBoxf& b) {
1784 |         vMin = _mm_min_ps(vMin.xmm, b.vMin.xmm);
1785 |         vMax = _mm_max_ps(vMax.xmm, b.vMax.xmm);
1786 |     }
1787 |     // Squared distance from "from" to its clamped position inside the box
1788 |     ML_INLINE float DistanceSquared(const float3& from) const {
1789 |         v4f nearest = v4f_clamp(from.xmm, vMin.xmm, vMax.xmm);
1790 |         v4f delta = _mm_sub_ps(nearest, from.xmm);
1791 |         v4f distSq = v4f_dot33(delta, delta);
1792 | 
1793 |         return _mm_cvtss_f32(distSq);
1794 |     }
1795 |     // Distance from "from" to its clamped position inside the box
1796 |     ML_INLINE float Distance(const float3& from) const {
1797 |         v4f nearest = v4f_clamp(from.xmm, vMin.xmm, vMax.xmm);
1798 |         v4f delta = _mm_sub_ps(nearest, from.xmm);
1799 |         v4f dist = v4f_length(delta);
1800 | 
1801 |         return _mm_cvtss_f32(dist);
1802 |     }
1803 |     // True unless the boxes are separated along some axis (strict compares: touching boxes count as intersecting)
1804 |     ML_INLINE bool IsIntersectWith(const cBoxf& b) const {
1805 |         v4f below = _mm_cmplt_ps(vMax.xmm, b.vMin.xmm);
1806 |         v4f above = _mm_cmpgt_ps(vMin.xmm, b.vMax.xmm);
1807 | 
1808 |         return v4f_test3_none(_mm_or_ps(below, above));
1809 |     }
1810 | 
1811 |     // NOTE: intersection state 'b' vs 'this'
1812 | 
1813 |     ML_INLINE eClip GetIntersectionState(const cBoxf& b) const {
1814 |         if (!IsIntersectWith(b))
1815 |             return CLIP_OUT;
1816 |         // CLIP_IN requires "b" to lie strictly between vMin and vMax, otherwise the overlap is partial
1817 |         v4f inside = _mm_cmplt_ps(vMin.xmm, b.vMin.xmm);
1818 |         inside = _mm_and_ps(inside, _mm_cmpgt_ps(vMax.xmm, b.vMax.xmm));
1819 | 
1820 |         return v4f_test3_all(inside) ? CLIP_IN : CLIP_PARTIAL;
1821 |     }
1822 |     // True when "p" is inside the box or exactly on its boundary
1823 |     ML_INLINE bool IsContain(const float3& p) const {
1824 |         v4f tooLow = _mm_cmplt_ps(p.xmm, vMin.xmm);
1825 |         v4f tooHigh = _mm_cmpgt_ps(p.xmm, vMax.xmm);
1826 | 
1827 |         return v4f_test3_none(_mm_or_ps(tooLow, tooHigh));
1828 |     }
1829 |     // Tests "center" against the box grown by "radius" on every side
1830 |     ML_INLINE bool IsContainSphere(const float3& center, float radius) const {
1831 |         v4f rrrr = _mm_set1_ps(radius);
1832 |         v4f lowBound = _mm_sub_ps(vMin.xmm, rrrr);
1833 |         v4f outside = _mm_cmplt_ps(center.xmm, lowBound);
1834 |         // Early out: the center is below the grown box
1835 |         if (v4f_test3_any(outside))
1836 |             return false;
1837 | 
1838 |         v4f highBound = _mm_add_ps(vMax.xmm, rrrr);
1839 |         outside = _mm_cmpgt_ps(center.xmm, highBound);
1840 | 
1841 |         // NOTE(review): the bounds are vMin - r / vMax + r, so a sphere merely overlapping the box also passes;
1842 |         //               confirm with callers whether full containment was intended
1843 | 
1844 |         return !v4f_test3_any(outside);
1845 |     }
1846 |     // Bits 0..2: b.vMin >= vMin per axis; bits 3..5: b.vMax <= vMax per axis
1847 |     ML_INLINE uint32_t GetIntersectionBits(const cBoxf& b) const {
1848 |         v4f minInside = _mm_cmpge_ps(b.vMin.xmm, vMin.xmm);
1849 |         uint32_t bits = (_mm_movemask_ps(minInside) & ML_Mask(1, 1, 1, 0));
1850 | 
1851 |         v4f maxInside = _mm_cmple_ps(b.vMax.xmm, vMax.xmm);
1852 |         bits |= (_mm_movemask_ps(maxInside) & ML_Mask(1, 1, 1, 0)) << 3;
1853 | 
1854 |         return bits;
1855 |     }
1856 |     // Same bit layout for a point; the new bits are OR-ed into the incoming "bits"
1857 |     ML_INLINE uint32_t IsContain(const float3& p, uint32_t bits) const {
1858 |         v4f aboveMin = _mm_cmpge_ps(p.xmm, vMin.xmm);
1859 |         bits |= (_mm_movemask_ps(aboveMin) & ML_Mask(1, 1, 1, 0));
1860 | 
1861 |         v4f belowMax = _mm_cmple_ps(p.xmm, vMax.xmm);
1862 |         bits |= (_mm_movemask_ps(belowMax) & ML_Mask(1, 1, 1, 0)) << 3;
1863 | 
1864 |         return bits;
1865 |     }
1866 |     // Ray vs box slab test: always writes both out parameters, returns "tmax >= tmin" (out pointers must be valid)
1867 |     ML_INLINE bool IsIntersectWith(const float3& vRayPos, const float3& vRayDir, float* out_fTmin, float* out_fTmax) const {
1868 |         // NOTE: http://tavianator.com/2011/05/fast-branchless-raybounding-box-intersections/
1869 | 
1870 |         // IMPORTANT: store '1 / ray_dir' and filter INFs out!
1871 | 
1872 |         v4f tToMin = _mm_div_ps(_mm_sub_ps(vMin.xmm, vRayPos.xmm), vRayDir.xmm);
1873 |         v4f tToMax = _mm_div_ps(_mm_sub_ps(vMax.xmm, vRayPos.xmm), vRayDir.xmm);
1874 | 
1875 |         v4f tNear = _mm_min_ps(tToMin, tToMax);
1876 |         v4f tFar = _mm_max_ps(tToMin, tToMax);
1877 | 
1878 |         // NOTE: hmax.xxx
1879 |         v4f tEnter = _mm_max_ps(tNear, v4f_swizzle(tNear, ML_Y, ML_Z, ML_X, 0));
1880 |         tEnter = _mm_max_ps(tEnter, v4f_swizzle(tNear, ML_Z, ML_X, ML_Y, 0));
1881 | 
1882 |         // NOTE: hmin.xxx
1883 |         v4f tExit = _mm_min_ps(tFar, v4f_swizzle(tFar, ML_Y, ML_Z, ML_X, 0));
1884 |         tExit = _mm_min_ps(tExit, v4f_swizzle(tFar, ML_Z, ML_X, ML_Y, 0));
1885 | 
1886 |         v4f_store_x(out_fTmin, tEnter);
1887 |         v4f_store_x(out_fTmax, tExit);
1888 | 
1889 |         v4f exitAfterEnter = _mm_cmpge_ps(tExit, tEnter);
1890 | 
1891 |         return (_mm_movemask_ps(exitAfterEnter) & ML_Mask(1, 0, 0, 0)) == ML_Mask(1, 0, 0, 0);
1892 |     }
1893 | };
1894 | 
1895 | ML_INLINE void TransformAabb(const float4x4& mTransform, const cBoxf& src, cBoxf& dst) {
1896 |     float3 mid = (src.vMin + src.vMax) * 0.5f;
1897 |     float3 halfSize = src.vMax - mid;
1898 |     // The center goes through the full transform, the half size through RotateAbs (columns 0..2 only)
1899 |     mid = mTransform * mid;
1900 |     halfSize = RotateAbs(mTransform, halfSize);
1901 |     // Rebuild the corners around the transformed center
1902 |     dst.vMin = mid - halfSize;
1903 |     dst.vMax = mid + halfSize;
1904 | }
1905 | 


--------------------------------------------------------------------------------
/Guts/i32.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | //======================================================================================================================
  6 | // int2
  7 | //======================================================================================================================
  8 | 
  9 | union int2 {  // Two 32-bit signed integers; the views below alias the same storage
 10 |     v2i mm;
 11 | 
 12 |     struct {
 13 |         int32_t a[COORD_2D];
 14 |     };
 15 | 
 16 |     struct {
 17 |         int32_t x, y;
 18 |     };
 19 | 
 20 |     ML_SWIZZLE_2(int2, int32_t);
 21 | 
 22 | public:
 23 |     ML_INLINE int2()
 24 |         : mm(0) {
 25 |     }
 26 |     // Same value in both components
 27 |     ML_INLINE int2(int32_t c)
 28 |         : x(c), y(c) {
 29 |     }
 30 | 
 31 |     ML_INLINE int2(int32_t _x, int32_t _y)
 32 |         : x(_x), y(_y) {
 33 |     }
 34 | 
 35 |     ML_INLINE int2(const int2& v) = default;
 36 | 
 37 |     // Set
 38 |     // NOTE: assignment returns void, so "a = b = c" chains do not compile
 39 |     ML_INLINE void operator=(const int2& v) {
 40 |         mm = v.mm;
 41 |     }
 42 | 
 43 |     // Conversion
 44 |     // (declared only; the definitions are not in this part of the file)
 45 |     ML_INLINE operator uint2() const;
 46 |     ML_INLINE operator float2() const;
 47 |     ML_INLINE operator double2() const;
 48 | 
 49 |     // Compare
 50 |     // (operators are generated by ML_COMPARE_UNOPT, whose definition is not shown here; result type is bool2)
 51 |     ML_COMPARE_UNOPT(bool2, int2, <)
 52 |     ML_COMPARE_UNOPT(bool2, int2, <=)
 53 |     ML_COMPARE_UNOPT(bool2, int2, ==)
 54 |     ML_COMPARE_UNOPT(bool2, int2, >=)
 55 |     ML_COMPARE_UNOPT(bool2, int2, >)
 56 |     ML_COMPARE_UNOPT(bool2, int2, !=)
 57 | 
 58 |     // Ops
 59 |     // Unary minus negates each component with the scalar operator
 60 |     ML_INLINE int2 operator-() const {
 61 |         return int2(-x, -y);
 62 |     }
 63 |     // Binary / compound operators are generated by ML_OP_UNOPT (definition not shown here)
 64 |     ML_OP_UNOPT(int2, int32_t, -, -=)
 65 |     ML_OP_UNOPT(int2, int32_t, +, +=)
 66 |     ML_OP_UNOPT(int2, int32_t, *, *=)
 67 |     ML_OP_UNOPT(int2, int32_t, /, /=)
 68 |     ML_OP_UNOPT(int2, int32_t, %, %=)
 69 |     ML_OP_UNOPT(int2, int32_t, <<, <<=)
 70 |     ML_OP_UNOPT(int2, int32_t, >>, >>=)
 71 |     ML_OP_UNOPT(int2, int32_t, &, &=)
 72 |     ML_OP_UNOPT(int2, int32_t, |, |=)
 73 |     ML_OP_UNOPT(int2, int32_t, ^, ^=)
 74 | };
 75 | 
 76 | ML_INLINE int2 min(const int2& x, const int2& y) {  // Component-wise minimum
 77 |     return int2(min(x.x, y.x), min(x.y, y.y));
 78 | }
 79 | 
 80 | ML_INLINE int2 max(const int2& x, const int2& y) {  // Component-wise maximum
 81 |     return int2(max(x.x, y.x), max(x.y, y.y));
 82 | }
 83 | 
 84 | //======================================================================================================================
 85 | // int3
 86 | //======================================================================================================================
 87 | 
 88 | union int3 {  // Three 32-bit signed integers stored in one 4-lane register; the views below alias the same storage
 89 |     v4i xmm;
 90 | 
 91 |     struct {
 92 |         int32_t a[COORD_3D];
 93 |     };
 94 | 
 95 |     struct {
 96 |         int32_t x, y, z;
 97 |     };
 98 | 
 99 |     ML_SWIZZLE_3(v4i_swizzle2, int2, v4i_swizzle3, int3);
100 | 
101 | public:
102 |     ML_INLINE int3()
103 |         : xmm(_mm_setzero_si128()) {
104 |     }
105 |     // Same value in every lane (including the unused 4th one)
106 |     ML_INLINE int3(int32_t c)
107 |         : xmm(_mm_set1_epi32(c)) {
108 |     }
109 |     // The component-wise constructors below pass 1 as the 4th argument of v4i_set
110 |     ML_INLINE int3(int32_t _x, int32_t _y, int32_t _z)
111 |         : xmm(v4i_set(_x, _y, _z, 1)) {
112 |     }
113 | 
114 |     ML_INLINE int3(const int2& v, int32_t _z)
115 |         : xmm(v4i_set(v.x, v.y, _z, 1)) {
116 |     }
117 | 
118 |     ML_INLINE int3(int32_t _x, const int2& v)
119 |         : xmm(v4i_set(_x, v.x, v.y, 1)) {
120 |     }
121 |     // Takes the register as is (all four lanes)
122 |     ML_INLINE int3(const v4i& v)
123 |         : xmm(v) {
124 |     }
125 |     // Reads exactly three values: v3[0], v3[1], v3[2]
126 |     ML_INLINE int3(const int32_t* v3)
127 |         : xmm(v4i_set(v3[0], v3[1], v3[2], 1)) {
128 |     }
129 | 
130 |     ML_INLINE int3(const int3& v) = default;
131 | 
132 |     // Set
133 |     // NOTE: assignment returns void, so "a = b = c" chains do not compile
134 |     ML_INLINE void operator=(const int3& v) {
135 |         xmm = v.xmm;
136 |     }
137 | 
138 |     // Conversion
139 |     // (declared only; the definitions are not in this part of the file)
140 |     ML_INLINE operator uint3() const;
141 |     ML_INLINE operator float3() const;
142 |     ML_INLINE operator double3() const;
143 | 
144 |     // Compare
145 |     // (operators are generated by ML_COMPARE, whose definition is not shown here; result type is bool3)
146 |     ML_COMPARE(bool3, int3, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm)
147 |     ML_COMPARE(bool3, int3, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm)
148 |     ML_COMPARE(bool3, int3, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm)
149 |     ML_COMPARE(bool3, int3, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm)
150 |     ML_COMPARE(bool3, int3, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm)
151 |     ML_COMPARE(bool3, int3, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm)
152 | 
153 |     // Ops
154 |     // Integer negation is "0 - x"; flipping the top bit (the float sign trick) would turn 1 into -2147483647
155 |     ML_INLINE int3 operator-() const {
156 |         return _mm_sub_epi32(_mm_setzero_si128(), xmm);
157 |     }
158 |     // NOTE(review): ">>" maps to _mm_srlv_epi32 (a logical shift) while int2 uses ML_OP_UNOPT - confirm for negative values
159 |     ML_OP(int3, int32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm)
160 |     ML_OP(int3, int32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm)
161 |     ML_OP(int3, int32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm)
162 |     ML_OP(int3, int32_t, /, /=, _mm_div_epi32, _mm_set1_epi32, xmm)
163 |     ML_OP(int3, int32_t, %, %=, v4i_mod, _mm_set1_epi32, xmm)
164 |     ML_OP(int3, int32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm)
165 |     ML_OP(int3, int32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm)
166 |     ML_OP(int3, int32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm)
167 |     ML_OP(int3, int32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm)
168 |     ML_OP(int3, int32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm)
169 | 
170 |     // Misc
171 |     // Implicit access to the underlying register
172 |     ML_INLINE operator v4i() const {
173 |         return xmm;
174 |     }
175 |     // All lanes zero
176 |     static ML_INLINE int3 Zero() {
177 |         return _mm_setzero_si128();
178 |     }
179 | };
180 | 
181 | ML_INLINE int3 min(const int3& x, const int3& y) {
182 |     return _mm_min_epi32(x.xmm, y.xmm);
183 | }
184 | 
185 | ML_INLINE int3 max(const int3& x, const int3& y) {
186 |     return _mm_max_epi32(x.xmm, y.xmm);
187 | }
188 | 
189 | //======================================================================================================================
190 | // int4
191 | //======================================================================================================================
192 | 
193 | union int4 {
194 |     v4i xmm;
195 | 
196 |     struct {
197 |         int32_t a[COORD_4D];
198 |     };
199 | 
200 |     struct {
201 |         int32_t x, y, z, w;
202 |     };
203 | 
204 |     ML_SWIZZLE_4(v4i_swizzle2, int2, v4i_swizzle3, int3, v4i_swizzle4, int4);
205 | 
206 | public:
207 |     ML_INLINE int4()
208 |         : xmm(_mm_setzero_si128()) {
209 |     }
210 | 
211 |     ML_INLINE int4(int32_t c)
212 |         : xmm(_mm_set1_epi32(c)) {
213 |     }
214 | 
215 |     ML_INLINE int4(int32_t _x, int32_t _y, int32_t _z, int32_t _w)
216 |         : xmm(v4i_set(_x, _y, _z, _w)) {
217 |     }
218 | 
219 |     ML_INLINE int4(const int3& v, int32_t _w)
220 |         : xmm(v4i_set(v.x, v.y, v.z, _w)) {
221 |     }
222 | 
223 |     ML_INLINE int4(const int2& a, const int2& b)
224 |         : xmm(v4i_set(a.x, a.y, b.x, b.y)) {
225 |     }
226 | 
227 |     ML_INLINE int4(int32_t _x, const int3& v)
228 |         : xmm(v4i_set(_x, v.x, v.y, v.z)) {
229 |     }
230 | 
231 |     ML_INLINE int4(const v4i& v)
232 |         : xmm(v) {
233 |     }
234 | 
235 |     ML_INLINE int4(const int4& v) = default;
236 | 
237 |     // Set
238 | 
239 |     ML_INLINE void operator=(const int4& v) {
240 |         xmm = v.xmm;
241 |     }
242 | 
243 |     // Compare
244 | 
245 |     ML_COMPARE(bool4, int4, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm)
246 |     ML_COMPARE(bool4, int4, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm)
247 |     ML_COMPARE(bool4, int4, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm)
248 |     ML_COMPARE(bool4, int4, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm)
249 |     ML_COMPARE(bool4, int4, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm)
250 |     ML_COMPARE(bool4, int4, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm)
251 | 
252 |     // Conversion
253 | 
254 |     ML_INLINE operator uint4() const;
255 |     ML_INLINE operator float4() const;
256 |     ML_INLINE operator double4() const;
257 | 
258 |     // Ops
259 | 
260 |     ML_INLINE int4 operator-() const {
261 |         return _mm_xor_si128(xmm, _mm_set1_epi32(0x80000000));
262 |     }
263 | 
264 |     ML_OP(int4, int32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm)
265 |     ML_OP(int4, int32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm)
266 |     ML_OP(int4, int32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm)
267 |     ML_OP(int4, int32_t, /, /=, _mm_div_epi32, _mm_set1_epi32, xmm)
268 |     ML_OP(int4, int32_t, %, %=, v4i_mod, _mm_set1_epi32, xmm)
269 |     ML_OP(int4, int32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm)
270 |     ML_OP(int4, int32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm)
271 |     ML_OP(int4, int32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm)
272 |     ML_OP(int4, int32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm)
273 |     ML_OP(int4, int32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm)
274 | 
275 |     // Misc
276 | 
277 |     ML_INLINE operator v4i() const {
278 |         return xmm;
279 |     }
280 | 
281 |     static ML_INLINE int4 Zero() {
282 |         return _mm_setzero_si128();
283 |     }
284 | };
285 | 
286 | ML_INLINE int4 min(const int4& x, const int4& y) {
287 |     return _mm_min_epi32(x.xmm, y.xmm);
288 | }
289 | 
290 | ML_INLINE int4 max(const int4& x, const int4& y) {
291 |     return _mm_max_epi32(x.xmm, y.xmm);
292 | }
293 | 


--------------------------------------------------------------------------------
/Guts/other.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | //======================================================================================================================
  6 | // Other
  7 | //======================================================================================================================
  8 | 
  9 | ML_INLINE float SplitZ_Logarithmic(uint32_t i, uint32_t splits, float fZnear, float fZfar) {
 10 |     float ratio = fZfar / fZnear;
 11 |     float k = float(i) / float(splits);
 12 |     float z = fZnear * pow(ratio, k);
 13 | 
 14 |     return z;
 15 | }
 16 | 
 17 | ML_INLINE float SplitZ_Uniform(uint32_t i, uint32_t splits, float fZnear, float fZfar) {
 18 |     float delta = fZfar - fZnear;
 19 |     float k = float(i) / float(splits);
 20 |     float z = fZnear + delta * k;
 21 | 
 22 |     return z;
 23 | }
 24 | 
 25 | ML_INLINE float SplitZ_Mixed(uint32_t i, uint32_t splits, float fZnear, float fZfar, float lambda) {
 26 |     float z_log = SplitZ_Logarithmic(i, splits, fZnear, fZfar);
 27 |     float z_uni = SplitZ_Uniform(i, splits, fZnear, fZfar);
 28 |     float z = lerp(z_log, z_uni, lambda);
 29 | 
 30 |     return z;
 31 | }
 32 | 
 33 | ML_INLINE uint32_t GreatestCommonDivisor(uint32_t a, uint32_t b) {
 34 |     while (a && b) {
 35 |         if (a >= b)
 36 |             a = a % b;
 37 |         else
 38 |             b = b % a;
 39 |     }
 40 | 
 41 |     return a + b;
 42 | }
 43 | 
 44 | ML_INLINE uint32_t LeastCommonMultiple(uint32_t a, uint32_t b) {
 45 |     return (a * b) / GreatestCommonDivisor(a, b);
 46 | }
 47 | 
 48 | //======================================================================================================================
 49 | // Ray-Triangle/AABB
 50 | //======================================================================================================================
 51 | 
 52 | // NOTE: overlapping axis-aligned boundary box and triangle (center - aabb center, extents - half size)
 53 | // NOTE: AABB-triangle overlap test code by Tomas Akenine-Moller
 54 | //       http://fileadmin.cs.lth.se/cs/Personal/Tomas_Akenine-Moller/code/
 55 | //       SSE code from http://www.codercorner.com/blog/?p=1118
 56 | 
 57 | ML_INLINE uint32_t TestClassIII(const v4f& e0V, const v4f& v0V, const v4f& v1V, const v4f& v2V, const v4f& extents) {
 58 |     v4f fe0ZYX_V = v4f_abs(e0V);
 59 | 
 60 |     v4f e0XZY_V = v4f_swizzle(e0V, 1, 2, 0, 3);
 61 |     v4f v0XZY_V = v4f_swizzle(v0V, 1, 2, 0, 3);
 62 |     v4f v1XZY_V = v4f_swizzle(v1V, 1, 2, 0, 3);
 63 |     v4f v2XZY_V = v4f_swizzle(v2V, 1, 2, 0, 3);
 64 |     v4f fe0XZY_V = v4f_swizzle(fe0ZYX_V, 1, 2, 0, 3);
 65 |     v4f extentsXZY_V = v4f_swizzle(extents, 1, 2, 0, 3);
 66 | 
 67 |     v4f radV = _mm_add_ps(_mm_mul_ps(extents, fe0XZY_V), _mm_mul_ps(extentsXZY_V, fe0ZYX_V));
 68 |     v4f p0V = _mm_sub_ps(_mm_mul_ps(v0V, e0XZY_V), _mm_mul_ps(v0XZY_V, e0V));
 69 |     v4f p1V = _mm_sub_ps(_mm_mul_ps(v1V, e0XZY_V), _mm_mul_ps(v1XZY_V, e0V));
 70 |     v4f p2V = _mm_sub_ps(_mm_mul_ps(v2V, e0XZY_V), _mm_mul_ps(v2XZY_V, e0V));
 71 | 
 72 |     v4f minV = _mm_min_ps(_mm_min_ps(p0V, p1V), p2V);
 73 |     v4f maxV = _mm_max_ps(_mm_max_ps(p0V, p1V), p2V);
 74 | 
 75 |     uint32_t test = _mm_movemask_ps(_mm_cmpgt_ps(minV, radV));
 76 |     radV = _mm_sub_ps(_mm_setzero_ps(), radV);
 77 |     test |= _mm_movemask_ps(_mm_cmpgt_ps(radV, maxV));
 78 | 
 79 |     return test & 7;
 80 | }
 81 | 
 82 | ML_INLINE bool IsOverlapBoxTriangle(const float3& boxcenter, const float3& extents, const float3& p0, const float3& p1, const float3& p2) {
 83 |     v4f v0V = _mm_sub_ps(p0.xmm, boxcenter.xmm);
 84 |     v4f cV = v4f_abs(v0V);
 85 |     uint32_t test = _mm_movemask_ps(_mm_sub_ps(cV, extents.xmm));
 86 | 
 87 |     if ((test & 7) == 7)
 88 |         return true;
 89 | 
 90 |     v4f v1V = _mm_sub_ps(p1.xmm, boxcenter.xmm);
 91 |     v4f v2V = _mm_sub_ps(p2.xmm, boxcenter.xmm);
 92 |     v4f minV = _mm_min_ps(v0V, v1V);
 93 |     minV = _mm_min_ps(minV, v2V);
 94 |     test = _mm_movemask_ps(_mm_cmpgt_ps(minV, extents.xmm));
 95 | 
 96 |     if (test & 7)
 97 |         return false;
 98 | 
 99 |     v4f maxV = _mm_max_ps(v0V, v1V);
100 |     maxV = _mm_max_ps(maxV, v2V);
101 |     cV = _mm_sub_ps(_mm_setzero_ps(), extents.xmm);
102 |     test = _mm_movemask_ps(_mm_cmpgt_ps(cV, maxV));
103 | 
104 |     if (test & 7)
105 |         return false;
106 | 
107 |     v4f e0V = _mm_sub_ps(v1V, v0V);
108 |     v4f e1V = _mm_sub_ps(v2V, v1V);
109 |     v4f normalV = v4f_cross(e0V, e1V);
110 |     v4f dV = v4f_dot33(normalV, v0V);
111 | 
112 |     v4f normalSignsV = _mm_and_ps(normalV, c_v4f_Sign);
113 |     maxV = _mm_or_ps(extents.xmm, normalSignsV);
114 | 
115 |     v4f tmpV = v4f_dot33(normalV, maxV);
116 |     test = _mm_movemask_ps(_mm_cmpgt_ps(dV, tmpV));
117 | 
118 |     if (test & 7)
119 |         return false;
120 | 
121 |     normalSignsV = _mm_xor_ps(normalSignsV, c_v4f_Sign);
122 |     minV = _mm_or_ps(extents.xmm, normalSignsV);
123 | 
124 |     tmpV = v4f_dot33(normalV, minV);
125 |     test = _mm_movemask_ps(_mm_cmpgt_ps(tmpV, dV));
126 | 
127 |     if (test & 7)
128 |         return false;
129 | 
130 |     if (TestClassIII(e0V, v0V, v1V, v2V, extents.xmm))
131 |         return false;
132 | 
133 |     if (TestClassIII(e1V, v0V, v1V, v2V, extents.xmm))
134 |         return false;
135 | 
136 |     v4f e2V = _mm_sub_ps(v0V, v2V);
137 | 
138 |     if (TestClassIII(e2V, v0V, v1V, v2V, extents.xmm))
139 |         return false;
140 | 
141 |     return true;
142 | }
143 | 
144 | // NOTE: barycentric ray-triangle test by Tomas Akenine-Moller
145 | ML_INLINE bool IsIntersectRayTriangle(const float3& origin, const float3& dir, const float3& v1, const float3& v2, const float3& v3, float3& out_tuv) {
146 |     // find vectors for two edges sharing vert0
147 |     float3 e1 = v2 - v1;
148 |     float3 e2 = v3 - v1;
149 | 
150 |     // begin calculating determinant - also used to calculate U parameter
151 |     float3 pvec = cross(dir, e2);
152 | 
153 |     // if determinant is near zero, ray lies in plane of triangle
154 |     float det = dot(e1, pvec);
155 | 
156 |     if (det < -1e-6f)
157 |         return false;
158 | 
159 |     // calculate distance from vert0 to ray origin
160 |     float3 tvec = origin - v1;
161 | 
162 |     // calculate U parameter and test bounds
163 |     float u = dot(tvec, pvec);
164 | 
165 |     if (u < 0.0f || u > det)
166 |         return false;
167 | 
168 |     // prepare to test V parameter
169 |     float3 qvec = cross(tvec, e1);
170 | 
171 |     // calculate V parameter and test bounds
172 |     float v = dot(dir, qvec);
173 | 
174 |     if (v < 0.0f || u + v > det)
175 |         return false;
176 | 
177 |     // calculate t, scale parameters, ray intersects triangle
178 |     out_tuv.x = dot(e2, qvec);
179 |     out_tuv.y = u; // v
180 |     out_tuv.z = v; // 1 - (u + v)
181 | 
182 |     out_tuv /= det;
183 | 
184 |     return true;
185 | }
186 | 
187 | ML_INLINE bool IsIntersectRayTriangle(const float3& from, const float3& to, const float3& v1, const float3& v2, const float3& v3, float3& out_intersection, float3& out_normal) {
188 |     // find vectors for two edges sharing vert0
189 |     float3 e1 = v2 - v1;
190 |     float3 e2 = v3 - v1;
191 | 
192 |     // begin calculating determinant - also used to calculate U parameter
193 |     float3 dir = to - from;
194 |     float len = length(dir);
195 |     dir = normalize(dir);
196 | 
197 |     float3 pvec = cross(dir, e2);
198 | 
199 |     // if determinant is near zero, ray lies in plane of triangle
200 |     float det = dot(e1, pvec);
201 | 
202 |     if (det < -1e-6f)
203 |         return false;
204 | 
205 |     // calculate distance from vert0 to ray origin point "from"
206 |     float3 tvec = from - v1;
207 | 
208 |     // calculate U parameter and test bounds
209 |     float u = dot(tvec, pvec);
210 | 
211 |     if (u < 0.0f || u > det)
212 |         return false;
213 | 
214 |     // prepare to test V parameter
215 |     float3 qvec = cross(tvec, e1);
216 | 
217 |     // calculate V parameter and test bounds
218 |     float v = dot(dir, qvec);
219 | 
220 |     if (v < 0.0f || u + v > det)
221 |         return false;
222 | 
223 |     // calculate t, scale parameters, ray intersects triangle
224 |     float t = dot(e2, qvec) / det;
225 | 
226 |     if (t > len)
227 |         return false;
228 | 
229 |     out_intersection = from + dir * t;
230 |     out_normal = normalize(cross(e1, e2));
231 | 
232 |     return true;
233 | }
234 | 


--------------------------------------------------------------------------------
/Guts/packing.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | #define UF11_M_BITS 6
  6 | #define UF11_E_BITS 5
  7 | #define UF11_S_MASK 0x0
  8 | 
  9 | #define UF10_M_BITS 5
 10 | #define UF10_E_BITS 5
 11 | #define UF10_S_MASK 0x0
 12 | 
 13 | namespace Packing {
 14 | 
 15 | template <uint32_t Rbits, uint32_t Gbits, uint32_t Bbits, uint32_t Abits>
 16 | ML_INLINE uint32_t float4_to_unorm(const float4& v) {
 17 |     ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32");
 18 | 
 19 |     constexpr uint32_t Rmask = (1 << Rbits) - 1;
 20 |     constexpr uint32_t Gmask = (1 << Gbits) - 1;
 21 |     constexpr uint32_t Bmask = (1 << Bbits) - 1;
 22 |     constexpr uint32_t Amask = (1 << Abits) - 1;
 23 | 
 24 |     constexpr uint32_t Gshift = Rbits & 31;
 25 |     constexpr uint32_t Bshift = (Gshift + Gbits) & 31;
 26 |     constexpr uint32_t Ashift = (Bshift + Bbits) & 31;
 27 | 
 28 |     const v4f scale = v4f_set(float(Rmask), float(Gmask), float(Bmask), float(Amask));
 29 | 
 30 |     v4f t = _mm_mul_ps(v.xmm, scale);
 31 |     v4i i = _mm_cvtps_epi32(t);
 32 | 
 33 |     uint32_t p = _mm_cvtsi128_si32(i);
 34 |     p |= Gbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << Gshift);
 35 |     p |= Bbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(2, 2, 2, 2))) << Bshift);
 36 |     p |= Abits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(3, 3, 3, 3))) << Ashift);
 37 | 
 38 |     return p;
 39 | }
 40 | 
 41 | template <>
 42 | ML_INLINE uint32_t float4_to_unorm<8, 8, 8, 8>(const float4& v) {
 43 |     v4f t = _mm_mul_ps(v.xmm, _mm_set1_ps(255.0f));
 44 |     v4i i = _mm_cvtps_epi32(t);
 45 |     i = _mm_shuffle_epi8(i, _mm_set1_epi32(0x0C080400));
 46 | 
 47 |     return _mm_cvtsi128_si32(i);
 48 | }
 49 | 
 50 | ML_INLINE uint32_t float2_to_unorm_16_16(const float2& v) {
 51 |     v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f);
 52 |     t = _mm_mul_ps(t, _mm_set1_ps(65535.0f));
 53 |     v4i i = _mm_cvtps_epi32(t);
 54 | 
 55 |     uint32_t p = _mm_cvtsi128_si32(i);
 56 |     p |= _mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << 16;
 57 | 
 58 |     return p;
 59 | }
 60 | 
 61 | ML_INLINE uint32_t float3_to_ufloat_11_11_10(const float3& v) {
 62 |     uint32_t r = ToSmallFloat<UF11_M_BITS, UF11_E_BITS, UF11_S_MASK>(v.x);
 63 |     r |= ToSmallFloat<UF11_M_BITS, UF11_E_BITS, UF11_S_MASK>(v.y) << 11;
 64 |     r |= ToSmallFloat<UF10_M_BITS, UF10_E_BITS, UF10_S_MASK>(v.z) << 22;
 65 | 
 66 |     return r;
 67 | }
 68 | 
 69 | template <uint32_t Rbits, uint32_t Gbits, uint32_t Bbits, uint32_t Abits>
 70 | ML_INLINE uint32_t float4_to_snorm(const float4& v) {
 71 |     ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32");
 72 | 
 73 |     constexpr uint32_t Rmask = (1 << Rbits) - 1;
 74 |     constexpr uint32_t Gmask = (1 << Gbits) - 1;
 75 |     constexpr uint32_t Bmask = (1 << Bbits) - 1;
 76 |     constexpr uint32_t Amask = (1 << Abits) - 1;
 77 | 
 78 |     constexpr uint32_t Gshift = Rbits & 31;
 79 |     constexpr uint32_t Bshift = (Gshift + Gbits) & 31;
 80 |     constexpr uint32_t Ashift = (Bshift + Bbits) & 31;
 81 | 
 82 |     constexpr float Rrange = (1 << (Rbits == 0 ? 1 : (Rbits - 1))) - 1;
 83 |     constexpr float Grange = (1 << (Gbits == 0 ? 1 : (Gbits - 1))) - 1;
 84 |     constexpr float Brange = (1 << (Bbits == 0 ? 1 : (Bbits - 1))) - 1;
 85 |     constexpr float Arange = (1 << (Abits == 0 ? 1 : (Abits - 1))) - 1;
 86 | 
 87 |     const v4f scale = v4f_set(Rrange, Grange, Brange, Arange);
 88 |     const v4i mask = _mm_setr_epi32(Rmask, Gmask, Bmask, Amask);
 89 | 
 90 |     v4f t = _mm_mul_ps(v.xmm, scale);
 91 |     v4i i = _mm_cvtps_epi32(t);
 92 |     i = _mm_and_si128(i, mask);
 93 | 
 94 |     uint32_t p = _mm_cvtsi128_si32(i);
 95 |     p |= Gbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << Gshift);
 96 |     p |= Bbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(2, 2, 2, 2))) << Bshift);
 97 |     p |= Abits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(3, 3, 3, 3))) << Ashift);
 98 | 
 99 |     return p;
100 | }
101 | 
102 | template <>
103 | ML_INLINE uint32_t float4_to_snorm<8, 8, 8, 8>(const float4& v) {
104 |     v4f t = _mm_mul_ps(v.xmm, _mm_set1_ps(127.0f));
105 |     v4i i = _mm_cvtps_epi32(t);
106 |     i = _mm_shuffle_epi8(i, _mm_set1_epi32(0x0C080400));
107 | 
108 |     return _mm_cvtsi128_si32(i);
109 | }
110 | 
111 | ML_INLINE uint32_t float2_to_snorm_16_16(const float2& v) {
112 |     v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f);
113 |     t = _mm_mul_ps(t, _mm_set1_ps(32767.0f));
114 |     v4i i = _mm_cvtps_epi32(t);
115 |     i = _mm_and_si128(i, _mm_setr_epi32(65535, 65535, 0, 0));
116 | 
117 |     uint32_t p = _mm_cvtsi128_si32(i);
118 |     p |= _mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << 16;
119 | 
120 |     return p;
121 | }
122 | 
123 | ML_INLINE float16_t2 float2_to_float16_t2(const float2& v) {
124 |     float16_t2 r;
125 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
126 |     v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f);
127 |     v4i p = v4f_to_h4(t);
128 | 
129 |     *((int32_t*)&r) = _mm_cvtsi128_si32(p);
130 | #else
131 |     r.x = float16_t(v.x);
132 |     r.y = float16_t(v.y);
133 | #endif
134 | 
135 |     return r;
136 | }
137 | 
138 | ML_INLINE float16_t4 float4_to_float16_t4(const float4& v) {
139 |     float16_t4 r;
140 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
141 |     v4i p = v4f_to_h4(v.xmm);
142 |     *((int64_t*)&r) = _mm_extract_epi64(p, 0);
143 | #else
144 |     float16_t2 xy = float2_to_float16_t2(v.xy);
145 |     float16_t2 zw = float2_to_float16_t2(v.zw);
146 | 
147 |     r = float16_t4(xy, zw);
148 | #endif
149 | 
150 |     return r;
151 | }
152 | 
153 | template <uint32_t Rbits, uint32_t Gbits, uint32_t Bbits, uint32_t Abits>
154 | ML_INLINE float4 unorm_to_float4(uint32_t p) {
155 |     ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32");
156 | 
157 |     constexpr uint32_t Rmask = (1 << Rbits) - 1;
158 |     constexpr uint32_t Gmask = (1 << Gbits) - 1;
159 |     constexpr uint32_t Bmask = (1 << Bbits) - 1;
160 |     constexpr uint32_t Amask = (1 << Abits) - 1;
161 | 
162 |     constexpr uint32_t Gshift = Rbits & 31;
163 |     constexpr uint32_t Bshift = (Gshift + Gbits) & 31;
164 |     constexpr uint32_t Ashift = (Bshift + Bbits) & 31;
165 | 
166 |     constexpr float invRmask = Rmask == 0 ? 1.0f : 1.0f / Rmask;
167 |     constexpr float invGmask = Gmask == 0 ? 1.0f : 1.0f / Gmask;
168 |     constexpr float invBmask = Bmask == 0 ? 1.0f : 1.0f / Bmask;
169 |     constexpr float invAmask = Amask == 0 ? 1.0f : 1.0f / Amask;
170 | 
171 |     const v4f scale = v4f_set(invRmask, invGmask, invBmask, invAmask);
172 | 
173 |     v4i i = _mm_setr_epi32(p & Rmask, (p >> Gshift) & Gmask, (p >> Bshift) & Bmask, (p >> Ashift) & Amask);
174 |     v4f t = _mm_cvtepi32_ps(i);
175 |     t = _mm_mul_ps(t, scale);
176 | 
177 |     return t;
178 | }
179 | 
180 | template <>
181 | ML_INLINE float4 unorm_to_float4<8, 8, 8, 8>(uint32_t p) {
182 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4)
183 |     v4i i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
184 | #else
185 |     v4i i = _mm_set_epi32(p >> 24, (p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF);
186 | #endif
187 | 
188 |     v4f t = _mm_cvtepi32_ps(i);
189 |     t = _mm_mul_ps(t, _mm_set1_ps(1.0f / 255.0f));
190 | 
191 |     return t;
192 | }
193 | 
194 | ML_INLINE float3 ufloat_11_11_10_to_float3(uint32_t p) {
195 |     float3 v;
196 |     v.x = FromSmallFloat<UF11_M_BITS, UF11_E_BITS, UF11_S_MASK>(p & ((1 << 11) - 1));
197 |     v.y = FromSmallFloat<UF11_M_BITS, UF11_E_BITS, UF11_S_MASK>((p >> 11) & ((1 << 11) - 1));
198 |     v.z = FromSmallFloat<UF10_M_BITS, UF10_E_BITS, UF10_S_MASK>((p >> 22) & ((1 << 10) - 1));
199 | 
200 |     return v;
201 | }
202 | 
203 | template <uint32_t Rbits, uint32_t Gbits, uint32_t Bbits, uint32_t Abits>
204 | ML_INLINE float4 snorm_to_float4(uint32_t p) {
205 |     ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32");
206 | 
207 |     constexpr uint32_t Rmask = (1 << Rbits) - 1;
208 |     constexpr uint32_t Gmask = (1 << Gbits) - 1;
209 |     constexpr uint32_t Bmask = (1 << Bbits) - 1;
210 |     constexpr uint32_t Amask = (1 << Abits) - 1;
211 | 
212 |     constexpr uint32_t Gshift = Rbits & 31;
213 |     constexpr uint32_t Bshift = (Gshift + Gbits) & 31;
214 |     constexpr uint32_t Ashift = (Bshift + Bbits) & 31;
215 | 
216 |     constexpr uint32_t Rsign = 1 << (Rbits == 0 ? 0 : (Rbits - 1));
217 |     constexpr uint32_t Gsign = 1 << (Gbits == 0 ? 0 : (Gbits - 1));
218 |     constexpr uint32_t Bsign = 1 << (Bbits == 0 ? 0 : (Bbits - 1));
219 |     constexpr uint32_t Asign = 1 << (Abits == 0 ? 0 : (Abits - 1));
220 | 
221 |     constexpr float invRsignMinus1 = Rbits == 0 ? 1.0f : 1.0f / (Rsign - 1);
222 |     constexpr float invGsignMinus1 = Gbits == 0 ? 1.0f : 1.0f / (Gsign - 1);
223 |     constexpr float invBsignMinus1 = Bbits == 0 ? 1.0f : 1.0f / (Bsign - 1);
224 |     constexpr float invAsignMinus1 = Abits == 0 ? 1.0f : 1.0f / (Asign - 1);
225 | 
226 |     const v4i vsign = _mm_setr_epi32(Rsign, Gsign, Bsign, Asign);
227 |     const v4i vor = _mm_setr_epi32(~(Rsign - 1), ~(Gsign - 1), ~(Bsign - 1), ~(Asign - 1));
228 |     const v4f vscale = v4f_set(invRsignMinus1, invGsignMinus1, invBsignMinus1, invAsignMinus1);
229 | 
230 |     v4i i = _mm_setr_epi32(p & Rmask, (p >> Gshift) & Gmask, (p >> Bshift) & Bmask, (p >> Ashift) & Amask);
231 | 
232 |     v4i mask = _mm_and_si128(i, vsign);
233 |     v4i ii = _mm_or_si128(i, vor);
234 |     i = v4i_select(i, ii, _mm_cmpeq_epi32(mask, _mm_setzero_si128()));
235 | 
236 |     v4f t = _mm_cvtepi32_ps(i);
237 |     t = _mm_mul_ps(t, vscale);
238 |     t = _mm_max_ps(t, _mm_set1_ps(-1.0f));
239 | 
240 |     return t;
241 | }
242 | 
243 | template <>
244 | ML_INLINE float4 snorm_to_float4<8, 8, 8, 8>(uint32_t p) {
245 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4)
246 |     v4i i = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(p));
247 | #else
248 |     v4i i = _mm_set_epi32(int8_t(p >> 24), int8_t((p >> 16) & 0xFF), int8_t((p >> 8) & 0xFF), int8_t(p & 0xFF));
249 | #endif
250 | 
251 |     v4f t = _mm_cvtepi32_ps(i);
252 |     t = _mm_mul_ps(t, _mm_set1_ps(1.0f / 127.0f));
253 |     t = _mm_max_ps(t, _mm_set1_ps(-1.0f));
254 | 
255 |     return t;
256 | }
257 | 
258 | ML_INLINE float2 float16_t2_to_float2(const float16_t2& p) {
259 |     float2 r;
260 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
261 |     v4i t = _mm_cvtsi32_si128(*(int32_t*)&p);
262 |     v4f f = _mm_cvtph_ps(t);
263 | 
264 |     _mm_storel_pi((__m64*)&r.mm, f);
265 | #else
266 |     r.x = float(p.x);
267 |     r.y = float(p.y);
268 | #endif
269 | 
270 |     return r;
271 | }
272 | 
273 | ML_INLINE float4 float16_t4_to_float4(const float16_t4& p) {
274 |     float4 f;
275 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
276 |     v4i t = _mm_loadu_si64(&p);
277 |     f.xmm = _mm_cvtph_ps(t);
278 | #else
279 |     f.x = float(p.x);
280 |     f.y = float(p.y);
281 |     f.z = float(p.z);
282 |     f.w = float(p.w);
283 | #endif
284 | 
285 |     return f;
286 | }
287 | 
288 | } // namespace Packing
289 | 


--------------------------------------------------------------------------------
/Guts/sorting.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | // NOTE: returns p1 < p2 ? -1 : (p1 > p2 ? 1 : 0)
  6 | 
  7 | typedef int32_t (*pfn_cmp_qsort)(const void* p1, const void* p2);
  8 | 
  9 | // NOTE: true - swap, false - keep; a - left, b - right
 10 | 
 11 | template <class T>
 12 | inline bool Sort_default_less(const T& a, const T& b) {
 13 |     return a < b;
 14 | }
 15 | 
 16 | template <class T>
 17 | inline bool Sort_default_greater(const T& a, const T& b) {
 18 |     return a > b;
 19 | }
 20 | 
 21 | /*
 22 | bool Sort_multikey(const T& a, const T& b)
 23 | {
 24 |     if( a.property1 > b.property1 )
 25 |         return true;
 26 | 
 27 |     if( a.property1 == b.property1 )
 28 |     {
 29 |         if( a.property2 > b.property2 )
 30 |             return true;
 31 | 
 32 |         if( a.property2 == b.property2 )
 33 |             return a.property3 > b.property3;
 34 |     }
 35 | 
 36 |     return false;
 37 | }
 38 | */
 39 | 
 40 | // NOTE: heap sort
 41 | // memory:      O(1)
 42 | // random:      +40% vs qsort
 43 | // sorted:      -30% vs qsort
 44 | // reversed:    -30% vs qsort
 45 | 
 46 | template <class T, bool (*cmp)(const T& a, const T& b)>
 47 | void Sort_heap(T* a, uint32_t n) {
 48 |     if (n < 2)
 49 |         return;
 50 | 
 51 |     uint32_t i = n >> 1;
 52 | 
 53 |     for (;;) {
 54 |         T t;
 55 | 
 56 |         if (i > 0)
 57 |             t = a[--i];
 58 |         else {
 59 |             if (--n == 0)
 60 |                 return;
 61 | 
 62 |             t = a[n];
 63 |             a[n] = a[0];
 64 |         }
 65 | 
 66 |         uint32_t parent = i;
 67 |         uint32_t child = (i << 1) + 1;
 68 | 
 69 |         while (child < n) {
 70 |             if (child + 1 < n && cmp(a[child], a[child + 1]))
 71 |                 child++;
 72 | 
 73 |             if (cmp(t, a[child])) {
 74 |                 a[parent] = a[child];
 75 | 
 76 |                 parent = child;
 77 |                 child = (parent << 1) + 1;
 78 |             } else
 79 |                 break;
 80 |         }
 81 | 
 82 |         a[parent] = t;
 83 |     }
 84 | }
 85 | 
 86 | // NOTE: merge sort
 87 | // memory:      O(n), t - temp array, return pointer to sorted array (can be a or t)
 88 | // random:      +130% vs qsort
 89 | // sorted:      +35% vs qsort
 90 | // reversed:    +40% vs qsort
 91 | 
 92 | template <class T, bool (*cmp)(const T& a, const T& b)>
 93 | T* Sort_merge(T* t, T* a, uint32_t n) {
 94 |     if (n < 2)
 95 |         return a;
 96 | 
 97 |     uint32_t n2 = n << 1;
 98 | 
 99 |     for (uint32_t size = 2; size < n2; size <<= 1) {
100 |         T* tmp = t;
101 | 
102 |         for (uint32_t i = 0; i < n; i += size) {
103 |             uint32_t j = i;
104 |             uint32_t nj = i + (size >> 1);
105 | 
106 |             if (nj > n)
107 |                 nj = n;
108 | 
109 |             uint32_t k = nj;
110 |             uint32_t nk = i + size;
111 | 
112 |             if (nk > n)
113 |                 nk = n;
114 | 
115 |             while (j < nj && k < nk)
116 |                 *tmp++ = cmp(a[j], a[k]) ? a[j++] : a[k++];
117 | 
118 |             nj -= j;
119 |             nk -= k;
120 | 
121 |             if (nj) {
122 |                 memcpy(tmp, a + j, nj * sizeof(T));
123 |                 tmp += nj;
124 |             }
125 | 
126 |             if (nk) {
127 |                 memcpy(tmp, a + k, nk * sizeof(T));
128 |                 tmp += nk;
129 |             }
130 |         }
131 | 
132 |         tmp = a;
133 |         a = t;
134 |         t = tmp;
135 |     }
136 | 
137 |     return a;
138 | }
139 | 


--------------------------------------------------------------------------------
/Guts/swizzle.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | #define ML_X 0
  6 | #define ML_Y 1
  7 | #define ML_Z 2
  8 | #define ML_W 3
  9 | 
 10 | template <class C, typename T, uint32_t... Indices>
 11 | class swizzle {
 12 | private:
 13 |     // Based on: https://kiorisyshen.github.io/2018/08/27/Vector%20Swizzling%20and%20Parameter%20Pack%20in%20C++/
 14 |     T a[sizeof...(Indices)];
 15 | 
 16 | public:
 17 |     static constexpr uint32_t i[] = {Indices...};
 18 |     static constexpr size_t N = sizeof...(Indices);
 19 | 
 20 |     ML_INLINE void operator=(const C& rhs) {
 21 |         for (size_t n = 0; n < N; n++)
 22 |             a[i[n]] = rhs[n];
 23 |     }
 24 | 
 25 |     ML_INLINE operator C() const {
 26 |         return C(a[Indices]...);
 27 |     }
 28 | };
 29 | 
 30 | // Swizzle ops
 31 | 
 32 | #define ML_SWIZZLE_2_OP(op, f, swizzle) \
 33 |     ML_INLINE void operator op(const C& v) { \
 34 |         ML_StaticAssertMsg(X != Y, "Wrong swizzle in " ML_Stringify(op)); \
 35 |         a[X] op v.x; \
 36 |         a[Y] op v.y; \
 37 |     }
 38 | 
 39 | #define ML_SWIZZLE_3_OP(op, f, swizzle) \
 40 |     ML_INLINE void operator op(const C& v) { \
 41 |         ML_StaticAssertMsg(X != Y && Y != Z && Z != X, "Wrong swizzle in " ML_Stringify(op)); \
 42 |         a[X] op v.x; \
 43 |         a[Y] op v.y; \
 44 |         a[Z] op v.z; \
 45 |     }
 46 | 
 47 | #if 0
 48 | #    define ML_SWIZZLE_4_OP(op, f, swizzle) \
 49 |         ML_INLINE void operator op(const C& v) { \
 50 |             ML_StaticAssertMsg(X + Y + Z + W == 6, "Wrong swizzle in " ML_Stringify(op)); \
 51 |             a[X] op v.x; \
 52 |             a[Y] op v.y; \
 53 |             a[Z] op v.z; \
 54 |             a[W] op v.w; \
 55 |         }
 56 | #else
 57 | #    define ML_SWIZZLE_4_OP(op, f, swizzle) \
 58 |         ML_INLINE void operator op(const C& v) { \
 59 |             ML_StaticAssertMsg(X + Y + Z + W == 6, "Wrong swizzle in " ML_Stringify(op)); \
 60 |             vec = f(swizzle(vec, X, Y, Z, W), v); \
 61 |         }
 62 | #endif
 63 | 
 64 | // v4i
 65 | // v4i_swizzle2<C, X, Y>: proxy for a 2-component swizzle over int32_t storage. C is the vector type read from / written through the proxy; X and Y are the element indices the swizzle selects.
 66 | template <class C, uint32_t X, uint32_t Y>
 67 | class v4i_swizzle2 {
 68 | private:
 69 |     union {
 70 |         struct {
 71 |             v4i vec;
 72 |         };
 73 |         // Per-element view sharing the union with "vec"
 74 |         struct {
 75 |             int32_t a[COORD_4D];
 76 |         };
 77 |     };
 78 | 
 79 | public:
 80 |     // Read-only: fast. Builds C from elements X and Y of the array view
 81 |     ML_INLINE operator C() const {
 82 |         return C(a[X], a[Y]);
 83 |     }
 84 | 
 85 |     // Read-write: most likely slow. ML_SWIZZLE_2_OP applies each operator element by element to a[X] and a[Y]; it does not use the intrinsic / swizzle arguments passed below
 86 |     ML_SWIZZLE_2_OP(=, _mm_copy, v4i_swizzle)
 87 |     ML_SWIZZLE_2_OP(-=, _mm_sub_epi32, v4i_swizzle)
 88 |     ML_SWIZZLE_2_OP(+=, _mm_add_epi32, v4i_swizzle)
 89 |     ML_SWIZZLE_2_OP(*=, _mm_mullo_epi32, v4i_swizzle)
 90 |     ML_SWIZZLE_2_OP(/=, _mm_div_epi32, v4i_swizzle)
 91 |     ML_SWIZZLE_2_OP(%=, v4i_mod, v4i_swizzle)
 92 |     ML_SWIZZLE_2_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
 93 |     ML_SWIZZLE_2_OP(>>=, _mm_srlv_epi32, v4i_swizzle)
 94 |     ML_SWIZZLE_2_OP(&=, _mm_and_si128, v4i_swizzle)
 95 |     ML_SWIZZLE_2_OP(|=, _mm_or_si128, v4i_swizzle)
 96 |     ML_SWIZZLE_2_OP(^=, _mm_xor_si128, v4i_swizzle)
 97 | };
 98 | // v4i_swizzle3<C, X, Y, Z>: proxy for a 3-component swizzle over int32_t storage. C is the vector type read from / written through the proxy; X, Y and Z are the element indices the swizzle selects.
 99 | template <class C, uint32_t X, uint32_t Y, uint32_t Z>
100 | class v4i_swizzle3 {
101 | private:
102 |     union {
103 |         struct {
104 |             v4i vec;
105 |         };
106 |         // Per-element view sharing the union with "vec"
107 |         struct {
108 |             int32_t a[COORD_4D];
109 |         };
110 |     };
111 | 
112 | public:
113 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z and a fixed fourth selector of 3; the result is returned as C
114 |     ML_INLINE operator C() const {
115 |         return v4i_swizzle(vec, X, Y, Z, 3);
116 |     }
117 | 
118 |     // Read-write: most likely slow. ML_SWIZZLE_3_OP applies each operator element by element to a[X], a[Y] and a[Z]; it does not use the intrinsic / swizzle arguments passed below
119 |     ML_SWIZZLE_3_OP(=, _mm_copy, v4i_swizzle)
120 |     ML_SWIZZLE_3_OP(-=, _mm_sub_epi32, v4i_swizzle)
121 |     ML_SWIZZLE_3_OP(+=, _mm_add_epi32, v4i_swizzle)
122 |     ML_SWIZZLE_3_OP(*=, _mm_mullo_epi32, v4i_swizzle)
123 |     ML_SWIZZLE_3_OP(/=, _mm_div_epi32, v4i_swizzle)
124 |     ML_SWIZZLE_3_OP(%=, v4i_mod, v4i_swizzle)
125 |     ML_SWIZZLE_3_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
126 |     ML_SWIZZLE_3_OP(>>=, _mm_srlv_epi32, v4i_swizzle)
127 |     ML_SWIZZLE_3_OP(&=, _mm_and_si128, v4i_swizzle)
128 |     ML_SWIZZLE_3_OP(|=, _mm_or_si128, v4i_swizzle)
129 |     ML_SWIZZLE_3_OP(^=, _mm_xor_si128, v4i_swizzle)
130 | };
131 | // v4i_swizzle4<C, X, Y, Z, W>: proxy for a 4-component swizzle over int32_t storage. C is the vector type read from / written through the proxy; X, Y, Z and W are the element indices the swizzle selects.
132 | template <class C, uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
133 | class v4i_swizzle4 {
134 | private:
135 |     union {
136 |         struct {
137 |             v4i vec;
138 |         };
139 |         // Per-element view sharing the union with "vec"
140 |         struct {
141 |             int32_t a[COORD_4D];
142 |         };
143 |     };
144 | 
145 | public:
146 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z, W; the result is returned as C
147 |     ML_INLINE operator C() const {
148 |         return v4i_swizzle(vec, X, Y, Z, W);
149 |     }
150 | 
151 |     // Read-write: most likely slow. One operator per line, generated by ML_SWIZZLE_4_OP from (operator, packed routine, swizzle helper)
152 |     ML_SWIZZLE_4_OP(=, _mm_copy, v4i_swizzle)
153 |     ML_SWIZZLE_4_OP(-=, _mm_sub_epi32, v4i_swizzle)
154 |     ML_SWIZZLE_4_OP(+=, _mm_add_epi32, v4i_swizzle)
155 |     ML_SWIZZLE_4_OP(*=, _mm_mullo_epi32, v4i_swizzle)
156 |     ML_SWIZZLE_4_OP(/=, _mm_div_epi32, v4i_swizzle)
157 |     ML_SWIZZLE_4_OP(%=, v4i_mod, v4i_swizzle)
158 |     ML_SWIZZLE_4_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
159 |     ML_SWIZZLE_4_OP(>>=, _mm_srlv_epi32, v4i_swizzle) // NOTE(review): elements are int32_t but the routine named here is the "srl" (logical) shift - confirm that is intended for signed lanes
160 |     ML_SWIZZLE_4_OP(&=, _mm_and_si128, v4i_swizzle)
161 |     ML_SWIZZLE_4_OP(|=, _mm_or_si128, v4i_swizzle)
162 |     ML_SWIZZLE_4_OP(^=, _mm_xor_si128, v4i_swizzle)
163 | };
164 | 
165 | // v4u
166 | // v4u_swizzle2<C, X, Y>: proxy for a 2-component swizzle over uint32_t storage. C is the vector type read from / written through the proxy; X and Y are the element indices the swizzle selects.
167 | template <class C, uint32_t X, uint32_t Y>
168 | class v4u_swizzle2 {
169 | private:
170 |     union {
171 |         struct {
172 |             v4i vec;
173 |         };
174 |         // Per-element unsigned view sharing the union with "vec" (which is declared as v4i)
175 |         struct {
176 |             uint32_t a[COORD_4D];
177 |         };
178 |     };
179 | 
180 | public:
181 |     // Read-only: fast. Builds C from elements X and Y of the array view
182 |     ML_INLINE operator C() const {
183 |         return C(a[X], a[Y]);
184 |     }
185 | 
186 |     // Read-write: most likely slow. ML_SWIZZLE_2_OP applies each operator element by element to a[X] and a[Y]; it does not use the intrinsic / swizzle arguments passed below
187 |     ML_SWIZZLE_2_OP(=, _mm_copy, v4i_swizzle)
188 |     ML_SWIZZLE_2_OP(-=, _mm_sub_epi32, v4i_swizzle)
189 |     ML_SWIZZLE_2_OP(+=, _mm_add_epi32, v4i_swizzle)
190 |     ML_SWIZZLE_2_OP(*=, _mm_mullo_epi32, v4i_swizzle)
191 |     ML_SWIZZLE_2_OP(/=, _mm_div_epu32, v4i_swizzle)
192 |     ML_SWIZZLE_2_OP(%=, v4u_mod, v4i_swizzle)
193 |     ML_SWIZZLE_2_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
194 |     ML_SWIZZLE_2_OP(>>=, _mm_srlv_epi32, v4i_swizzle)
195 |     ML_SWIZZLE_2_OP(&=, _mm_and_si128, v4i_swizzle)
196 |     ML_SWIZZLE_2_OP(|=, _mm_or_si128, v4i_swizzle)
197 |     ML_SWIZZLE_2_OP(^=, _mm_xor_si128, v4i_swizzle)
198 | };
199 | // v4u_swizzle3<C, X, Y, Z>: proxy for a 3-component swizzle over uint32_t storage. C is the vector type read from / written through the proxy; X, Y and Z are the element indices the swizzle selects.
200 | template <class C, uint32_t X, uint32_t Y, uint32_t Z>
201 | class v4u_swizzle3 {
202 | private:
203 |     union {
204 |         struct {
205 |             v4i vec;
206 |         };
207 |         // Per-element unsigned view sharing the union with "vec" (which is declared as v4i)
208 |         struct {
209 |             uint32_t a[COORD_4D];
210 |         };
211 |     };
212 | 
213 | public:
214 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z and a fixed fourth selector of 3; the result is returned as C
215 |     ML_INLINE operator C() const {
216 |         return v4i_swizzle(vec, X, Y, Z, 3);
217 |     }
218 | 
219 |     // Read-write: most likely slow. ML_SWIZZLE_3_OP applies each operator element by element to a[X], a[Y] and a[Z]; it does not use the intrinsic / swizzle arguments passed below
220 |     ML_SWIZZLE_3_OP(=, _mm_copy, v4i_swizzle)
221 |     ML_SWIZZLE_3_OP(-=, _mm_sub_epi32, v4i_swizzle)
222 |     ML_SWIZZLE_3_OP(+=, _mm_add_epi32, v4i_swizzle)
223 |     ML_SWIZZLE_3_OP(*=, _mm_mullo_epi32, v4i_swizzle)
224 |     ML_SWIZZLE_3_OP(/=, _mm_div_epu32, v4i_swizzle)
225 |     ML_SWIZZLE_3_OP(%=, v4u_mod, v4i_swizzle)
226 |     ML_SWIZZLE_3_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
227 |     ML_SWIZZLE_3_OP(>>=, _mm_srlv_epi32, v4i_swizzle)
228 |     ML_SWIZZLE_3_OP(&=, _mm_and_si128, v4i_swizzle)
229 |     ML_SWIZZLE_3_OP(|=, _mm_or_si128, v4i_swizzle)
230 |     ML_SWIZZLE_3_OP(^=, _mm_xor_si128, v4i_swizzle)
231 | };
232 | // v4u_swizzle4<C, X, Y, Z, W>: proxy for a 4-component swizzle over uint32_t storage. C is the vector type read from / written through the proxy; X, Y, Z and W are the element indices the swizzle selects.
233 | template <class C, uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
234 | class v4u_swizzle4 {
235 | private:
236 |     union {
237 |         struct {
238 |             v4i vec;
239 |         };
240 |         // Per-element unsigned view sharing the union with "vec" (which is declared as v4i)
241 |         struct {
242 |             uint32_t a[COORD_4D];
243 |         };
244 |     };
245 | 
246 | public:
247 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z, W; the result is returned as C
248 |     ML_INLINE operator C() const {
249 |         return v4i_swizzle(vec, X, Y, Z, W);
250 |     }
251 | 
252 |     // Read-write: most likely slow. One operator per line, generated by ML_SWIZZLE_4_OP from (operator, packed routine, swizzle helper)
253 |     ML_SWIZZLE_4_OP(=, _mm_copy, v4i_swizzle)
254 |     ML_SWIZZLE_4_OP(-=, _mm_sub_epi32, v4i_swizzle)
255 |     ML_SWIZZLE_4_OP(+=, _mm_add_epi32, v4i_swizzle)
256 |     ML_SWIZZLE_4_OP(*=, _mm_mullo_epi32, v4i_swizzle)
257 |     ML_SWIZZLE_4_OP(/=, _mm_div_epu32, v4i_swizzle)
258 |     ML_SWIZZLE_4_OP(%=, v4u_mod, v4i_swizzle)
259 |     ML_SWIZZLE_4_OP(<<=, _mm_sllv_epi32, v4i_swizzle)
260 |     ML_SWIZZLE_4_OP(>>=, _mm_srlv_epi32, v4i_swizzle)
261 |     ML_SWIZZLE_4_OP(&=, _mm_and_si128, v4i_swizzle)
262 |     ML_SWIZZLE_4_OP(|=, _mm_or_si128, v4i_swizzle)
263 |     ML_SWIZZLE_4_OP(^=, _mm_xor_si128, v4i_swizzle)
264 | };
265 | 
266 | // v4f
267 | // v4f_swizzle2<C, X, Y>: proxy for a 2-component swizzle over float storage. C is the vector type read from / written through the proxy; X and Y are the element indices the swizzle selects.
268 | template <class C, uint32_t X, uint32_t Y>
269 | class v4f_swizzle2 {
270 | private:
271 |     union {
272 |         struct {
273 |             v4f vec;
274 |         };
275 |         // Per-element view sharing the union with "vec"
276 |         struct {
277 |             float a[COORD_4D];
278 |         };
279 |     };
280 | 
281 | public:
282 |     // Read-only: fast. Builds C from elements X and Y of the array view
283 |     ML_INLINE operator C() const {
284 |         return C(a[X], a[Y]);
285 |     }
286 | 
287 |     // Read-write: most likely slow. ML_SWIZZLE_2_OP applies each operator element by element to a[X] and a[Y]; it does not use the intrinsic / swizzle arguments passed below
288 |     ML_SWIZZLE_2_OP(=, _mm_copy, v4f_swizzle)
289 |     ML_SWIZZLE_2_OP(-=, _mm_sub_ps, v4f_swizzle)
290 |     ML_SWIZZLE_2_OP(+=, _mm_add_ps, v4f_swizzle)
291 |     ML_SWIZZLE_2_OP(*=, _mm_mul_ps, v4f_swizzle)
292 |     ML_SWIZZLE_2_OP(/=, _mm_div_ps, v4f_swizzle)
293 | };
294 | // v4f_swizzle3<C, X, Y, Z>: proxy for a 3-component swizzle over float storage. C is the vector type read from / written through the proxy; X, Y and Z are the element indices the swizzle selects.
295 | template <class C, uint32_t X, uint32_t Y, uint32_t Z>
296 | class v4f_swizzle3 {
297 | private:
298 |     union {
299 |         struct {
300 |             v4f vec;
301 |         };
302 |         // Per-element view sharing the union with "vec"
303 |         struct {
304 |             float a[COORD_4D];
305 |         };
306 |     };
307 | 
308 | public:
309 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z and a fixed fourth selector of 3; the result is returned as C
310 |     ML_INLINE operator C() const {
311 |         return v4f_swizzle(vec, X, Y, Z, 3);
312 |     }
313 | 
314 |     // Read-write: most likely slow. ML_SWIZZLE_3_OP applies each operator element by element to a[X], a[Y] and a[Z]; it does not use the intrinsic / swizzle arguments passed below
315 |     ML_SWIZZLE_3_OP(=, _mm_copy, v4f_swizzle)
316 |     ML_SWIZZLE_3_OP(-=, _mm_sub_ps, v4f_swizzle)
317 |     ML_SWIZZLE_3_OP(+=, _mm_add_ps, v4f_swizzle)
318 |     ML_SWIZZLE_3_OP(*=, _mm_mul_ps, v4f_swizzle)
319 |     ML_SWIZZLE_3_OP(/=, _mm_div_ps, v4f_swizzle)
320 | };
321 | // v4f_swizzle4<C, X, Y, Z, W>: proxy for a 4-component swizzle over float storage. C is the vector type read from / written through the proxy; X, Y, Z and W are the element indices the swizzle selects.
322 | template <class C, uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
323 | class v4f_swizzle4 {
324 | private:
325 |     union {
326 |         struct {
327 |             v4f vec;
328 |         };
329 |         // Per-element view sharing the union with "vec"
330 |         struct {
331 |             float a[COORD_4D];
332 |         };
333 |     };
334 | 
335 | public:
336 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z, W; the result is returned as C
337 |     ML_INLINE operator C() const {
338 |         return v4f_swizzle(vec, X, Y, Z, W);
339 |     }
340 | 
341 |     // Read-write: most likely slow. One operator per line, generated by ML_SWIZZLE_4_OP from (operator, packed routine, swizzle helper)
342 |     ML_SWIZZLE_4_OP(=, _mm_copy, v4f_swizzle)
343 |     ML_SWIZZLE_4_OP(-=, _mm_sub_ps, v4f_swizzle)
344 |     ML_SWIZZLE_4_OP(+=, _mm_add_ps, v4f_swizzle)
345 |     ML_SWIZZLE_4_OP(*=, _mm_mul_ps, v4f_swizzle)
346 |     ML_SWIZZLE_4_OP(/=, _mm_div_ps, v4f_swizzle)
347 | };
348 | 
349 | // v4d
350 | // v4d_swizzle2<C, X, Y>: proxy for a 2-component swizzle over double storage. C is the vector type read from / written through the proxy; X and Y are the element indices the swizzle selects.
351 | template <class C, uint32_t X, uint32_t Y>
352 | class v4d_swizzle2 {
353 | private:
354 |     union {
355 |         struct {
356 |             v4d vec;
357 |         };
358 |         // Per-element view sharing the union with "vec"
359 |         struct {
360 |             double a[COORD_4D];
361 |         };
362 |     };
363 | 
364 | public:
365 |     // Read-only: fast. Builds C from elements X and Y of the array view
366 |     ML_INLINE operator C() const {
367 |         return C(a[X], a[Y]);
368 |     }
369 | 
370 |     // Read-write: most likely slow. ML_SWIZZLE_2_OP applies each operator element by element to a[X] and a[Y]; it does not use the intrinsic / swizzle arguments passed below
371 |     ML_SWIZZLE_2_OP(=, _mm_copy, v4d_swizzle)
372 |     ML_SWIZZLE_2_OP(-=, _mm256_sub_pd, v4d_swizzle)
373 |     ML_SWIZZLE_2_OP(+=, _mm256_add_pd, v4d_swizzle)
374 |     ML_SWIZZLE_2_OP(*=, _mm256_mul_pd, v4d_swizzle)
375 |     ML_SWIZZLE_2_OP(/=, _mm256_div_pd, v4d_swizzle)
376 | };
377 | // v4d_swizzle3<C, X, Y, Z>: proxy for a 3-component swizzle over double storage. C is the vector type read from / written through the proxy; X, Y and Z are the element indices the swizzle selects.
378 | template <class C, uint32_t X, uint32_t Y, uint32_t Z>
379 | class v4d_swizzle3 {
380 | private:
381 |     union {
382 |         struct {
383 |             v4d vec;
384 |         };
385 |         // Per-element view sharing the union with "vec"
386 |         struct {
387 |             double a[COORD_4D];
388 |         };
389 |     };
390 | 
391 | public:
392 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z and a fixed fourth selector of 3; the result is returned as C
393 |     ML_INLINE operator C() const {
394 |         return v4d_swizzle(vec, X, Y, Z, 3);
395 |     }
396 | 
397 |     // Read-write: most likely slow. ML_SWIZZLE_3_OP applies each operator element by element to a[X], a[Y] and a[Z]; it does not use the intrinsic / swizzle arguments passed below
398 |     ML_SWIZZLE_3_OP(=, _mm_copy, v4d_swizzle)
399 |     ML_SWIZZLE_3_OP(-=, _mm256_sub_pd, v4d_swizzle)
400 |     ML_SWIZZLE_3_OP(+=, _mm256_add_pd, v4d_swizzle)
401 |     ML_SWIZZLE_3_OP(*=, _mm256_mul_pd, v4d_swizzle)
402 |     ML_SWIZZLE_3_OP(/=, _mm256_div_pd, v4d_swizzle)
403 | };
404 | // v4d_swizzle4<C, X, Y, Z, W>: proxy for a 4-component swizzle over double storage. C is the vector type read from / written through the proxy; X, Y, Z and W are the element indices the swizzle selects.
405 | template <class C, uint32_t X, uint32_t Y, uint32_t Z, uint32_t W>
406 | class v4d_swizzle4 {
407 | private:
408 |     union {
409 |         struct {
410 |             v4d vec;
411 |         };
412 |         // Per-element view sharing the union with "vec"
413 |         struct {
414 |             double a[COORD_4D];
415 |         };
416 |     };
417 | 
418 | public:
419 |     // Read-only: fast. Swizzles the packed value with selectors X, Y, Z, W; the result is returned as C
420 |     ML_INLINE operator C() const {
421 |         return v4d_swizzle(vec, X, Y, Z, W);
422 |     }
423 | 
424 |     // Read-write: most likely slow. One operator per line, generated by ML_SWIZZLE_4_OP from (operator, packed routine, swizzle helper)
425 |     ML_SWIZZLE_4_OP(=, _mm_copy, v4d_swizzle)
426 |     ML_SWIZZLE_4_OP(-=, _mm256_sub_pd, v4d_swizzle)
427 |     ML_SWIZZLE_4_OP(+=, _mm256_add_pd, v4d_swizzle)
428 |     ML_SWIZZLE_4_OP(*=, _mm256_mul_pd, v4d_swizzle)
429 |     ML_SWIZZLE_4_OP(/=, _mm256_div_pd, v4d_swizzle)
430 | };
431 | // The operator-generating macros are only needed by the proxy classes above; remove them so they do not leak further
432 | #undef ML_SWIZZLE_2_OP
433 | #undef ML_SWIZZLE_3_OP
434 | #undef ML_SWIZZLE_4_OP
435 | 
436 | // swizzles
437 | // ML_SWIZZLE_2(C, T): declares the members xx, xy, yx, yy, each a swizzle<C, T, i, j> instantiation with the matching ML_X / ML_Y indices. The last declaration has no terminating semicolon.
438 | #define ML_SWIZZLE_2(C, T) \
439 |     swizzle<C, T, ML_X, ML_X> xx; \
440 |     swizzle<C, T, ML_X, ML_Y> xy; \
441 |     swizzle<C, T, ML_Y, ML_X> yx; \
442 |     swizzle<C, T, ML_Y, ML_Y> yy
443 | // ML_SWIZZLE_3(S2, C2, S3, C3): declares every 2-letter (9, as S2<C2, i, j>) and 3-letter (27, as S3<C3, i, j, k>) swizzle member over x, y, z; each member name spells its index list. The last declaration has no terminating semicolon.
444 | #define ML_SWIZZLE_3(S2, C2, S3, C3) \
445 |     S2<C2, ML_X, ML_X> xx; \
446 |     S2<C2, ML_X, ML_Y> xy; \
447 |     S2<C2, ML_X, ML_Z> xz; \
448 |     S2<C2, ML_Y, ML_X> yx; \
449 |     S2<C2, ML_Y, ML_Y> yy; \
450 |     S2<C2, ML_Y, ML_Z> yz; \
451 |     S2<C2, ML_Z, ML_X> zx; \
452 |     S2<C2, ML_Z, ML_Y> zy; \
453 |     S2<C2, ML_Z, ML_Z> zz; \
454 |     S3<C3, ML_X, ML_X, ML_X> xxx; \
455 |     S3<C3, ML_X, ML_X, ML_Y> xxy; \
456 |     S3<C3, ML_X, ML_X, ML_Z> xxz; \
457 |     S3<C3, ML_X, ML_Y, ML_X> xyx; \
458 |     S3<C3, ML_X, ML_Y, ML_Y> xyy; \
459 |     S3<C3, ML_X, ML_Y, ML_Z> xyz; \
460 |     S3<C3, ML_X, ML_Z, ML_X> xzx; \
461 |     S3<C3, ML_X, ML_Z, ML_Y> xzy; \
462 |     S3<C3, ML_X, ML_Z, ML_Z> xzz; \
463 |     S3<C3, ML_Y, ML_X, ML_X> yxx; \
464 |     S3<C3, ML_Y, ML_X, ML_Y> yxy; \
465 |     S3<C3, ML_Y, ML_X, ML_Z> yxz; \
466 |     S3<C3, ML_Y, ML_Y, ML_X> yyx; \
467 |     S3<C3, ML_Y, ML_Y, ML_Y> yyy; \
468 |     S3<C3, ML_Y, ML_Y, ML_Z> yyz; \
469 |     S3<C3, ML_Y, ML_Z, ML_X> yzx; \
470 |     S3<C3, ML_Y, ML_Z, ML_Y> yzy; \
471 |     S3<C3, ML_Y, ML_Z, ML_Z> yzz; \
472 |     S3<C3, ML_Z, ML_X, ML_X> zxx; \
473 |     S3<C3, ML_Z, ML_X, ML_Y> zxy; \
474 |     S3<C3, ML_Z, ML_X, ML_Z> zxz; \
475 |     S3<C3, ML_Z, ML_Y, ML_X> zyx; \
476 |     S3<C3, ML_Z, ML_Y, ML_Y> zyy; \
477 |     S3<C3, ML_Z, ML_Y, ML_Z> zyz; \
478 |     S3<C3, ML_Z, ML_Z, ML_X> zzx; \
479 |     S3<C3, ML_Z, ML_Z, ML_Y> zzy; \
480 |     S3<C3, ML_Z, ML_Z, ML_Z> zzz
481 | // ML_SWIZZLE_4(S2, C2, S3, C3, S4, C4): declares every 2-letter (16, as S2<C2, i, j>), 3-letter (64, as S3<C3, i, j, k>) and 4-letter (256, as S4<C4, i, j, k, l>) swizzle member over x, y, z, w; each member name spells its index list. The last declaration has no terminating semicolon.
482 | #define ML_SWIZZLE_4(S2, C2, S3, C3, S4, C4) \
483 |     S2<C2, ML_X, ML_X> xx; \
484 |     S2<C2, ML_X, ML_Y> xy; \
485 |     S2<C2, ML_X, ML_Z> xz; \
486 |     S2<C2, ML_X, ML_W> xw; \
487 |     S2<C2, ML_Y, ML_X> yx; \
488 |     S2<C2, ML_Y, ML_Y> yy; \
489 |     S2<C2, ML_Y, ML_Z> yz; \
490 |     S2<C2, ML_Y, ML_W> yw; \
491 |     S2<C2, ML_Z, ML_X> zx; \
492 |     S2<C2, ML_Z, ML_Y> zy; \
493 |     S2<C2, ML_Z, ML_Z> zz; \
494 |     S2<C2, ML_Z, ML_W> zw; \
495 |     S2<C2, ML_W, ML_X> wx; \
496 |     S2<C2, ML_W, ML_Y> wy; \
497 |     S2<C2, ML_W, ML_Z> wz; \
498 |     S2<C2, ML_W, ML_W> ww; \
499 |     S3<C3, ML_X, ML_X, ML_X> xxx; \
500 |     S3<C3, ML_X, ML_X, ML_Y> xxy; \
501 |     S3<C3, ML_X, ML_X, ML_Z> xxz; \
502 |     S3<C3, ML_X, ML_X, ML_W> xxw; \
503 |     S3<C3, ML_X, ML_Y, ML_X> xyx; \
504 |     S3<C3, ML_X, ML_Y, ML_Y> xyy; \
505 |     S3<C3, ML_X, ML_Y, ML_Z> xyz; \
506 |     S3<C3, ML_X, ML_Y, ML_W> xyw; \
507 |     S3<C3, ML_X, ML_Z, ML_X> xzx; \
508 |     S3<C3, ML_X, ML_Z, ML_Y> xzy; \
509 |     S3<C3, ML_X, ML_Z, ML_Z> xzz; \
510 |     S3<C3, ML_X, ML_Z, ML_W> xzw; \
511 |     S3<C3, ML_X, ML_W, ML_X> xwx; \
512 |     S3<C3, ML_X, ML_W, ML_Y> xwy; \
513 |     S3<C3, ML_X, ML_W, ML_Z> xwz; \
514 |     S3<C3, ML_X, ML_W, ML_W> xww; \
515 |     S3<C3, ML_Y, ML_X, ML_X> yxx; \
516 |     S3<C3, ML_Y, ML_X, ML_Y> yxy; \
517 |     S3<C3, ML_Y, ML_X, ML_Z> yxz; \
518 |     S3<C3, ML_Y, ML_X, ML_W> yxw; \
519 |     S3<C3, ML_Y, ML_Y, ML_X> yyx; \
520 |     S3<C3, ML_Y, ML_Y, ML_Y> yyy; \
521 |     S3<C3, ML_Y, ML_Y, ML_Z> yyz; \
522 |     S3<C3, ML_Y, ML_Y, ML_W> yyw; \
523 |     S3<C3, ML_Y, ML_Z, ML_X> yzx; \
524 |     S3<C3, ML_Y, ML_Z, ML_Y> yzy; \
525 |     S3<C3, ML_Y, ML_Z, ML_Z> yzz; \
526 |     S3<C3, ML_Y, ML_Z, ML_W> yzw; \
527 |     S3<C3, ML_Y, ML_W, ML_X> ywx; \
528 |     S3<C3, ML_Y, ML_W, ML_Y> ywy; \
529 |     S3<C3, ML_Y, ML_W, ML_Z> ywz; \
530 |     S3<C3, ML_Y, ML_W, ML_W> yww; \
531 |     S3<C3, ML_Z, ML_X, ML_X> zxx; \
532 |     S3<C3, ML_Z, ML_X, ML_Y> zxy; \
533 |     S3<C3, ML_Z, ML_X, ML_Z> zxz; \
534 |     S3<C3, ML_Z, ML_X, ML_W> zxw; \
535 |     S3<C3, ML_Z, ML_Y, ML_X> zyx; \
536 |     S3<C3, ML_Z, ML_Y, ML_Y> zyy; \
537 |     S3<C3, ML_Z, ML_Y, ML_Z> zyz; \
538 |     S3<C3, ML_Z, ML_Y, ML_W> zyw; \
539 |     S3<C3, ML_Z, ML_Z, ML_X> zzx; \
540 |     S3<C3, ML_Z, ML_Z, ML_Y> zzy; \
541 |     S3<C3, ML_Z, ML_Z, ML_Z> zzz; \
542 |     S3<C3, ML_Z, ML_Z, ML_W> zzw; \
543 |     S3<C3, ML_Z, ML_W, ML_X> zwx; \
544 |     S3<C3, ML_Z, ML_W, ML_Y> zwy; \
545 |     S3<C3, ML_Z, ML_W, ML_Z> zwz; \
546 |     S3<C3, ML_Z, ML_W, ML_W> zww; \
547 |     S3<C3, ML_W, ML_X, ML_X> wxx; \
548 |     S3<C3, ML_W, ML_X, ML_Y> wxy; \
549 |     S3<C3, ML_W, ML_X, ML_Z> wxz; \
550 |     S3<C3, ML_W, ML_X, ML_W> wxw; \
551 |     S3<C3, ML_W, ML_Y, ML_X> wyx; \
552 |     S3<C3, ML_W, ML_Y, ML_Y> wyy; \
553 |     S3<C3, ML_W, ML_Y, ML_Z> wyz; \
554 |     S3<C3, ML_W, ML_Y, ML_W> wyw; \
555 |     S3<C3, ML_W, ML_Z, ML_X> wzx; \
556 |     S3<C3, ML_W, ML_Z, ML_Y> wzy; \
557 |     S3<C3, ML_W, ML_Z, ML_Z> wzz; \
558 |     S3<C3, ML_W, ML_Z, ML_W> wzw; \
559 |     S3<C3, ML_W, ML_W, ML_X> wwx; \
560 |     S3<C3, ML_W, ML_W, ML_Y> wwy; \
561 |     S3<C3, ML_W, ML_W, ML_Z> wwz; \
562 |     S3<C3, ML_W, ML_W, ML_W> www; \
563 |     S4<C4, ML_X, ML_X, ML_X, ML_X> xxxx; \
564 |     S4<C4, ML_X, ML_X, ML_X, ML_Y> xxxy; \
565 |     S4<C4, ML_X, ML_X, ML_X, ML_Z> xxxz; \
566 |     S4<C4, ML_X, ML_X, ML_X, ML_W> xxxw; \
567 |     S4<C4, ML_X, ML_X, ML_Y, ML_X> xxyx; \
568 |     S4<C4, ML_X, ML_X, ML_Y, ML_Y> xxyy; \
569 |     S4<C4, ML_X, ML_X, ML_Y, ML_Z> xxyz; \
570 |     S4<C4, ML_X, ML_X, ML_Y, ML_W> xxyw; \
571 |     S4<C4, ML_X, ML_X, ML_Z, ML_X> xxzx; \
572 |     S4<C4, ML_X, ML_X, ML_Z, ML_Y> xxzy; \
573 |     S4<C4, ML_X, ML_X, ML_Z, ML_Z> xxzz; \
574 |     S4<C4, ML_X, ML_X, ML_Z, ML_W> xxzw; \
575 |     S4<C4, ML_X, ML_X, ML_W, ML_X> xxwx; \
576 |     S4<C4, ML_X, ML_X, ML_W, ML_Y> xxwy; \
577 |     S4<C4, ML_X, ML_X, ML_W, ML_Z> xxwz; \
578 |     S4<C4, ML_X, ML_X, ML_W, ML_W> xxww; \
579 |     S4<C4, ML_X, ML_Y, ML_X, ML_X> xyxx; \
580 |     S4<C4, ML_X, ML_Y, ML_X, ML_Y> xyxy; \
581 |     S4<C4, ML_X, ML_Y, ML_X, ML_Z> xyxz; \
582 |     S4<C4, ML_X, ML_Y, ML_X, ML_W> xyxw; \
583 |     S4<C4, ML_X, ML_Y, ML_Y, ML_X> xyyx; \
584 |     S4<C4, ML_X, ML_Y, ML_Y, ML_Y> xyyy; \
585 |     S4<C4, ML_X, ML_Y, ML_Y, ML_Z> xyyz; \
586 |     S4<C4, ML_X, ML_Y, ML_Y, ML_W> xyyw; \
587 |     S4<C4, ML_X, ML_Y, ML_Z, ML_X> xyzx; \
588 |     S4<C4, ML_X, ML_Y, ML_Z, ML_Y> xyzy; \
589 |     S4<C4, ML_X, ML_Y, ML_Z, ML_Z> xyzz; \
590 |     S4<C4, ML_X, ML_Y, ML_Z, ML_W> xyzw; \
591 |     S4<C4, ML_X, ML_Y, ML_W, ML_X> xywx; \
592 |     S4<C4, ML_X, ML_Y, ML_W, ML_Y> xywy; \
593 |     S4<C4, ML_X, ML_Y, ML_W, ML_Z> xywz; \
594 |     S4<C4, ML_X, ML_Y, ML_W, ML_W> xyww; \
595 |     S4<C4, ML_X, ML_Z, ML_X, ML_X> xzxx; \
596 |     S4<C4, ML_X, ML_Z, ML_X, ML_Y> xzxy; \
597 |     S4<C4, ML_X, ML_Z, ML_X, ML_Z> xzxz; \
598 |     S4<C4, ML_X, ML_Z, ML_X, ML_W> xzxw; \
599 |     S4<C4, ML_X, ML_Z, ML_Y, ML_X> xzyx; \
600 |     S4<C4, ML_X, ML_Z, ML_Y, ML_Y> xzyy; \
601 |     S4<C4, ML_X, ML_Z, ML_Y, ML_Z> xzyz; \
602 |     S4<C4, ML_X, ML_Z, ML_Y, ML_W> xzyw; \
603 |     S4<C4, ML_X, ML_Z, ML_Z, ML_X> xzzx; \
604 |     S4<C4, ML_X, ML_Z, ML_Z, ML_Y> xzzy; \
605 |     S4<C4, ML_X, ML_Z, ML_Z, ML_Z> xzzz; \
606 |     S4<C4, ML_X, ML_Z, ML_Z, ML_W> xzzw; \
607 |     S4<C4, ML_X, ML_Z, ML_W, ML_X> xzwx; \
608 |     S4<C4, ML_X, ML_Z, ML_W, ML_Y> xzwy; \
609 |     S4<C4, ML_X, ML_Z, ML_W, ML_Z> xzwz; \
610 |     S4<C4, ML_X, ML_Z, ML_W, ML_W> xzww; \
611 |     S4<C4, ML_X, ML_W, ML_X, ML_X> xwxx; \
612 |     S4<C4, ML_X, ML_W, ML_X, ML_Y> xwxy; \
613 |     S4<C4, ML_X, ML_W, ML_X, ML_Z> xwxz; \
614 |     S4<C4, ML_X, ML_W, ML_X, ML_W> xwxw; \
615 |     S4<C4, ML_X, ML_W, ML_Y, ML_X> xwyx; \
616 |     S4<C4, ML_X, ML_W, ML_Y, ML_Y> xwyy; \
617 |     S4<C4, ML_X, ML_W, ML_Y, ML_Z> xwyz; \
618 |     S4<C4, ML_X, ML_W, ML_Y, ML_W> xwyw; \
619 |     S4<C4, ML_X, ML_W, ML_Z, ML_X> xwzx; \
620 |     S4<C4, ML_X, ML_W, ML_Z, ML_Y> xwzy; \
621 |     S4<C4, ML_X, ML_W, ML_Z, ML_Z> xwzz; \
622 |     S4<C4, ML_X, ML_W, ML_Z, ML_W> xwzw; \
623 |     S4<C4, ML_X, ML_W, ML_W, ML_X> xwwx; \
624 |     S4<C4, ML_X, ML_W, ML_W, ML_Y> xwwy; \
625 |     S4<C4, ML_X, ML_W, ML_W, ML_Z> xwwz; \
626 |     S4<C4, ML_X, ML_W, ML_W, ML_W> xwww; \
627 |     S4<C4, ML_Y, ML_X, ML_X, ML_X> yxxx; \
628 |     S4<C4, ML_Y, ML_X, ML_X, ML_Y> yxxy; \
629 |     S4<C4, ML_Y, ML_X, ML_X, ML_Z> yxxz; \
630 |     S4<C4, ML_Y, ML_X, ML_X, ML_W> yxxw; \
631 |     S4<C4, ML_Y, ML_X, ML_Y, ML_X> yxyx; \
632 |     S4<C4, ML_Y, ML_X, ML_Y, ML_Y> yxyy; \
633 |     S4<C4, ML_Y, ML_X, ML_Y, ML_Z> yxyz; \
634 |     S4<C4, ML_Y, ML_X, ML_Y, ML_W> yxyw; \
635 |     S4<C4, ML_Y, ML_X, ML_Z, ML_X> yxzx; \
636 |     S4<C4, ML_Y, ML_X, ML_Z, ML_Y> yxzy; \
637 |     S4<C4, ML_Y, ML_X, ML_Z, ML_Z> yxzz; \
638 |     S4<C4, ML_Y, ML_X, ML_Z, ML_W> yxzw; \
639 |     S4<C4, ML_Y, ML_X, ML_W, ML_X> yxwx; \
640 |     S4<C4, ML_Y, ML_X, ML_W, ML_Y> yxwy; \
641 |     S4<C4, ML_Y, ML_X, ML_W, ML_Z> yxwz; \
642 |     S4<C4, ML_Y, ML_X, ML_W, ML_W> yxww; \
643 |     S4<C4, ML_Y, ML_Y, ML_X, ML_X> yyxx; \
644 |     S4<C4, ML_Y, ML_Y, ML_X, ML_Y> yyxy; \
645 |     S4<C4, ML_Y, ML_Y, ML_X, ML_Z> yyxz; \
646 |     S4<C4, ML_Y, ML_Y, ML_X, ML_W> yyxw; \
647 |     S4<C4, ML_Y, ML_Y, ML_Y, ML_X> yyyx; \
648 |     S4<C4, ML_Y, ML_Y, ML_Y, ML_Y> yyyy; \
649 |     S4<C4, ML_Y, ML_Y, ML_Y, ML_Z> yyyz; \
650 |     S4<C4, ML_Y, ML_Y, ML_Y, ML_W> yyyw; \
651 |     S4<C4, ML_Y, ML_Y, ML_Z, ML_X> yyzx; \
652 |     S4<C4, ML_Y, ML_Y, ML_Z, ML_Y> yyzy; \
653 |     S4<C4, ML_Y, ML_Y, ML_Z, ML_Z> yyzz; \
654 |     S4<C4, ML_Y, ML_Y, ML_Z, ML_W> yyzw; \
655 |     S4<C4, ML_Y, ML_Y, ML_W, ML_X> yywx; \
656 |     S4<C4, ML_Y, ML_Y, ML_W, ML_Y> yywy; \
657 |     S4<C4, ML_Y, ML_Y, ML_W, ML_Z> yywz; \
658 |     S4<C4, ML_Y, ML_Y, ML_W, ML_W> yyww; \
659 |     S4<C4, ML_Y, ML_Z, ML_X, ML_X> yzxx; \
660 |     S4<C4, ML_Y, ML_Z, ML_X, ML_Y> yzxy; \
661 |     S4<C4, ML_Y, ML_Z, ML_X, ML_Z> yzxz; \
662 |     S4<C4, ML_Y, ML_Z, ML_X, ML_W> yzxw; \
663 |     S4<C4, ML_Y, ML_Z, ML_Y, ML_X> yzyx; \
664 |     S4<C4, ML_Y, ML_Z, ML_Y, ML_Y> yzyy; \
665 |     S4<C4, ML_Y, ML_Z, ML_Y, ML_Z> yzyz; \
666 |     S4<C4, ML_Y, ML_Z, ML_Y, ML_W> yzyw; \
667 |     S4<C4, ML_Y, ML_Z, ML_Z, ML_X> yzzx; \
668 |     S4<C4, ML_Y, ML_Z, ML_Z, ML_Y> yzzy; \
669 |     S4<C4, ML_Y, ML_Z, ML_Z, ML_Z> yzzz; \
670 |     S4<C4, ML_Y, ML_Z, ML_Z, ML_W> yzzw; \
671 |     S4<C4, ML_Y, ML_Z, ML_W, ML_X> yzwx; \
672 |     S4<C4, ML_Y, ML_Z, ML_W, ML_Y> yzwy; \
673 |     S4<C4, ML_Y, ML_Z, ML_W, ML_Z> yzwz; \
674 |     S4<C4, ML_Y, ML_Z, ML_W, ML_W> yzww; \
675 |     S4<C4, ML_Y, ML_W, ML_X, ML_X> ywxx; \
676 |     S4<C4, ML_Y, ML_W, ML_X, ML_Y> ywxy; \
677 |     S4<C4, ML_Y, ML_W, ML_X, ML_Z> ywxz; \
678 |     S4<C4, ML_Y, ML_W, ML_X, ML_W> ywxw; \
679 |     S4<C4, ML_Y, ML_W, ML_Y, ML_X> ywyx; \
680 |     S4<C4, ML_Y, ML_W, ML_Y, ML_Y> ywyy; \
681 |     S4<C4, ML_Y, ML_W, ML_Y, ML_Z> ywyz; \
682 |     S4<C4, ML_Y, ML_W, ML_Y, ML_W> ywyw; \
683 |     S4<C4, ML_Y, ML_W, ML_Z, ML_X> ywzx; \
684 |     S4<C4, ML_Y, ML_W, ML_Z, ML_Y> ywzy; \
685 |     S4<C4, ML_Y, ML_W, ML_Z, ML_Z> ywzz; \
686 |     S4<C4, ML_Y, ML_W, ML_Z, ML_W> ywzw; \
687 |     S4<C4, ML_Y, ML_W, ML_W, ML_X> ywwx; \
688 |     S4<C4, ML_Y, ML_W, ML_W, ML_Y> ywwy; \
689 |     S4<C4, ML_Y, ML_W, ML_W, ML_Z> ywwz; \
690 |     S4<C4, ML_Y, ML_W, ML_W, ML_W> ywww; \
691 |     S4<C4, ML_Z, ML_X, ML_X, ML_X> zxxx; \
692 |     S4<C4, ML_Z, ML_X, ML_X, ML_Y> zxxy; \
693 |     S4<C4, ML_Z, ML_X, ML_X, ML_Z> zxxz; \
694 |     S4<C4, ML_Z, ML_X, ML_X, ML_W> zxxw; \
695 |     S4<C4, ML_Z, ML_X, ML_Y, ML_X> zxyx; \
696 |     S4<C4, ML_Z, ML_X, ML_Y, ML_Y> zxyy; \
697 |     S4<C4, ML_Z, ML_X, ML_Y, ML_Z> zxyz; \
698 |     S4<C4, ML_Z, ML_X, ML_Y, ML_W> zxyw; \
699 |     S4<C4, ML_Z, ML_X, ML_Z, ML_X> zxzx; \
700 |     S4<C4, ML_Z, ML_X, ML_Z, ML_Y> zxzy; \
701 |     S4<C4, ML_Z, ML_X, ML_Z, ML_Z> zxzz; \
702 |     S4<C4, ML_Z, ML_X, ML_Z, ML_W> zxzw; \
703 |     S4<C4, ML_Z, ML_X, ML_W, ML_X> zxwx; \
704 |     S4<C4, ML_Z, ML_X, ML_W, ML_Y> zxwy; \
705 |     S4<C4, ML_Z, ML_X, ML_W, ML_Z> zxwz; \
706 |     S4<C4, ML_Z, ML_X, ML_W, ML_W> zxww; \
707 |     S4<C4, ML_Z, ML_Y, ML_X, ML_X> zyxx; \
708 |     S4<C4, ML_Z, ML_Y, ML_X, ML_Y> zyxy; \
709 |     S4<C4, ML_Z, ML_Y, ML_X, ML_Z> zyxz; \
710 |     S4<C4, ML_Z, ML_Y, ML_X, ML_W> zyxw; \
711 |     S4<C4, ML_Z, ML_Y, ML_Y, ML_X> zyyx; \
712 |     S4<C4, ML_Z, ML_Y, ML_Y, ML_Y> zyyy; \
713 |     S4<C4, ML_Z, ML_Y, ML_Y, ML_Z> zyyz; \
714 |     S4<C4, ML_Z, ML_Y, ML_Y, ML_W> zyyw; \
715 |     S4<C4, ML_Z, ML_Y, ML_Z, ML_X> zyzx; \
716 |     S4<C4, ML_Z, ML_Y, ML_Z, ML_Y> zyzy; \
717 |     S4<C4, ML_Z, ML_Y, ML_Z, ML_Z> zyzz; \
718 |     S4<C4, ML_Z, ML_Y, ML_Z, ML_W> zyzw; \
719 |     S4<C4, ML_Z, ML_Y, ML_W, ML_X> zywx; \
720 |     S4<C4, ML_Z, ML_Y, ML_W, ML_Y> zywy; \
721 |     S4<C4, ML_Z, ML_Y, ML_W, ML_Z> zywz; \
722 |     S4<C4, ML_Z, ML_Y, ML_W, ML_W> zyww; \
723 |     S4<C4, ML_Z, ML_Z, ML_X, ML_X> zzxx; \
724 |     S4<C4, ML_Z, ML_Z, ML_X, ML_Y> zzxy; \
725 |     S4<C4, ML_Z, ML_Z, ML_X, ML_Z> zzxz; \
726 |     S4<C4, ML_Z, ML_Z, ML_X, ML_W> zzxw; \
727 |     S4<C4, ML_Z, ML_Z, ML_Y, ML_X> zzyx; \
728 |     S4<C4, ML_Z, ML_Z, ML_Y, ML_Y> zzyy; \
729 |     S4<C4, ML_Z, ML_Z, ML_Y, ML_Z> zzyz; \
730 |     S4<C4, ML_Z, ML_Z, ML_Y, ML_W> zzyw; \
731 |     S4<C4, ML_Z, ML_Z, ML_Z, ML_X> zzzx; \
732 |     S4<C4, ML_Z, ML_Z, ML_Z, ML_Y> zzzy; \
733 |     S4<C4, ML_Z, ML_Z, ML_Z, ML_Z> zzzz; \
734 |     S4<C4, ML_Z, ML_Z, ML_Z, ML_W> zzzw; \
735 |     S4<C4, ML_Z, ML_Z, ML_W, ML_X> zzwx; \
736 |     S4<C4, ML_Z, ML_Z, ML_W, ML_Y> zzwy; \
737 |     S4<C4, ML_Z, ML_Z, ML_W, ML_Z> zzwz; \
738 |     S4<C4, ML_Z, ML_Z, ML_W, ML_W> zzww; \
739 |     S4<C4, ML_Z, ML_W, ML_X, ML_X> zwxx; \
740 |     S4<C4, ML_Z, ML_W, ML_X, ML_Y> zwxy; \
741 |     S4<C4, ML_Z, ML_W, ML_X, ML_Z> zwxz; \
742 |     S4<C4, ML_Z, ML_W, ML_X, ML_W> zwxw; \
743 |     S4<C4, ML_Z, ML_W, ML_Y, ML_X> zwyx; \
744 |     S4<C4, ML_Z, ML_W, ML_Y, ML_Y> zwyy; \
745 |     S4<C4, ML_Z, ML_W, ML_Y, ML_Z> zwyz; \
746 |     S4<C4, ML_Z, ML_W, ML_Y, ML_W> zwyw; \
747 |     S4<C4, ML_Z, ML_W, ML_Z, ML_X> zwzx; \
748 |     S4<C4, ML_Z, ML_W, ML_Z, ML_Y> zwzy; \
749 |     S4<C4, ML_Z, ML_W, ML_Z, ML_Z> zwzz; \
750 |     S4<C4, ML_Z, ML_W, ML_Z, ML_W> zwzw; \
751 |     S4<C4, ML_Z, ML_W, ML_W, ML_X> zwwx; \
752 |     S4<C4, ML_Z, ML_W, ML_W, ML_Y> zwwy; \
753 |     S4<C4, ML_Z, ML_W, ML_W, ML_Z> zwwz; \
754 |     S4<C4, ML_Z, ML_W, ML_W, ML_W> zwww; \
755 |     S4<C4, ML_W, ML_X, ML_X, ML_X> wxxx; \
756 |     S4<C4, ML_W, ML_X, ML_X, ML_Y> wxxy; \
757 |     S4<C4, ML_W, ML_X, ML_X, ML_Z> wxxz; \
758 |     S4<C4, ML_W, ML_X, ML_X, ML_W> wxxw; \
759 |     S4<C4, ML_W, ML_X, ML_Y, ML_X> wxyx; \
760 |     S4<C4, ML_W, ML_X, ML_Y, ML_Y> wxyy; \
761 |     S4<C4, ML_W, ML_X, ML_Y, ML_Z> wxyz; \
762 |     S4<C4, ML_W, ML_X, ML_Y, ML_W> wxyw; \
763 |     S4<C4, ML_W, ML_X, ML_Z, ML_X> wxzx; \
764 |     S4<C4, ML_W, ML_X, ML_Z, ML_Y> wxzy; \
765 |     S4<C4, ML_W, ML_X, ML_Z, ML_Z> wxzz; \
766 |     S4<C4, ML_W, ML_X, ML_Z, ML_W> wxzw; \
767 |     S4<C4, ML_W, ML_X, ML_W, ML_X> wxwx; \
768 |     S4<C4, ML_W, ML_X, ML_W, ML_Y> wxwy; \
769 |     S4<C4, ML_W, ML_X, ML_W, ML_Z> wxwz; \
770 |     S4<C4, ML_W, ML_X, ML_W, ML_W> wxww; \
771 |     S4<C4, ML_W, ML_Y, ML_X, ML_X> wyxx; \
772 |     S4<C4, ML_W, ML_Y, ML_X, ML_Y> wyxy; \
773 |     S4<C4, ML_W, ML_Y, ML_X, ML_Z> wyxz; \
774 |     S4<C4, ML_W, ML_Y, ML_X, ML_W> wyxw; \
775 |     S4<C4, ML_W, ML_Y, ML_Y, ML_X> wyyx; \
776 |     S4<C4, ML_W, ML_Y, ML_Y, ML_Y> wyyy; \
777 |     S4<C4, ML_W, ML_Y, ML_Y, ML_Z> wyyz; \
778 |     S4<C4, ML_W, ML_Y, ML_Y, ML_W> wyyw; \
779 |     S4<C4, ML_W, ML_Y, ML_Z, ML_X> wyzx; \
780 |     S4<C4, ML_W, ML_Y, ML_Z, ML_Y> wyzy; \
781 |     S4<C4, ML_W, ML_Y, ML_Z, ML_Z> wyzz; \
782 |     S4<C4, ML_W, ML_Y, ML_Z, ML_W> wyzw; \
783 |     S4<C4, ML_W, ML_Y, ML_W, ML_X> wywx; \
784 |     S4<C4, ML_W, ML_Y, ML_W, ML_Y> wywy; \
785 |     S4<C4, ML_W, ML_Y, ML_W, ML_Z> wywz; \
786 |     S4<C4, ML_W, ML_Y, ML_W, ML_W> wyww; \
787 |     S4<C4, ML_W, ML_Z, ML_X, ML_X> wzxx; \
788 |     S4<C4, ML_W, ML_Z, ML_X, ML_Y> wzxy; \
789 |     S4<C4, ML_W, ML_Z, ML_X, ML_Z> wzxz; \
790 |     S4<C4, ML_W, ML_Z, ML_X, ML_W> wzxw; \
791 |     S4<C4, ML_W, ML_Z, ML_Y, ML_X> wzyx; \
792 |     S4<C4, ML_W, ML_Z, ML_Y, ML_Y> wzyy; \
793 |     S4<C4, ML_W, ML_Z, ML_Y, ML_Z> wzyz; \
794 |     S4<C4, ML_W, ML_Z, ML_Y, ML_W> wzyw; \
795 |     S4<C4, ML_W, ML_Z, ML_Z, ML_X> wzzx; \
796 |     S4<C4, ML_W, ML_Z, ML_Z, ML_Y> wzzy; \
797 |     S4<C4, ML_W, ML_Z, ML_Z, ML_Z> wzzz; \
798 |     S4<C4, ML_W, ML_Z, ML_Z, ML_W> wzzw; \
799 |     S4<C4, ML_W, ML_Z, ML_W, ML_X> wzwx; \
800 |     S4<C4, ML_W, ML_Z, ML_W, ML_Y> wzwy; \
801 |     S4<C4, ML_W, ML_Z, ML_W, ML_Z> wzwz; \
802 |     S4<C4, ML_W, ML_Z, ML_W, ML_W> wzww; \
803 |     S4<C4, ML_W, ML_W, ML_X, ML_X> wwxx; \
804 |     S4<C4, ML_W, ML_W, ML_X, ML_Y> wwxy; \
805 |     S4<C4, ML_W, ML_W, ML_X, ML_Z> wwxz; \
806 |     S4<C4, ML_W, ML_W, ML_X, ML_W> wwxw; \
807 |     S4<C4, ML_W, ML_W, ML_Y, ML_X> wwyx; \
808 |     S4<C4, ML_W, ML_W, ML_Y, ML_Y> wwyy; \
809 |     S4<C4, ML_W, ML_W, ML_Y, ML_Z> wwyz; \
810 |     S4<C4, ML_W, ML_W, ML_Y, ML_W> wwyw; \
811 |     S4<C4, ML_W, ML_W, ML_Z, ML_X> wwzx; \
812 |     S4<C4, ML_W, ML_W, ML_Z, ML_Y> wwzy; \
813 |     S4<C4, ML_W, ML_W, ML_Z, ML_Z> wwzz; \
814 |     S4<C4, ML_W, ML_W, ML_Z, ML_W> wwzw; \
815 |     S4<C4, ML_W, ML_W, ML_W, ML_X> wwwx; \
816 |     S4<C4, ML_W, ML_W, ML_W, ML_Y> wwwy; \
817 |     S4<C4, ML_W, ML_W, ML_W, ML_Z> wwwz; \
818 |     S4<C4, ML_W, ML_W, ML_W, ML_W> wwww
819 | 


--------------------------------------------------------------------------------
/Guts/tests.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #define Test_Eps         0.00001 // relative tolerance: the *_eps checks allow |expected| * Test_Eps per component
  4 | #define Test_ConstantEps 0.001 // absolute tolerance used by Test2_x3_ceps / Test2_x4_ceps; hack for "fmod"
  5 | // All Test* macros read x1, y1, z1, w1 (and x2, y2, z2, w2 for binary checks) from the enclosing scope and cast them to T. TestEqual_xN: a vector must compare equal in every component to an identically built one
  6 | #define TestEqual_x2(C, T) ML_Assert(all(C((T)x1, (T)y1) == C((T)x1, (T)y1)))
  7 | #define TestEqual_x3(C, T) ML_Assert(all(C((T)x1, (T)y1, (T)z1) == C((T)x1, (T)y1, (T)z1)))
  8 | #define TestEqual_x4(C, T) ML_Assert(all(C((T)x1, (T)y1, (T)z1, (T)w1) == C((T)x1, (T)y1, (T)z1, (T)w1)))
  9 | // TestNotEqual_xN: a vector must differ in at least one component from the same values placed in a different order
 10 | #define TestNotEqual_x2(C, T) ML_Assert(any(C((T)x1, (T)y1) != C((T)y1, (T)x1)))
 11 | #define TestNotEqual_x3(C, T) ML_Assert(any(C((T)x1, (T)y1, (T)z1) != C((T)z1, (T)x1, (T)y1)))
 12 | #define TestNotEqual_x4(C, T) ML_Assert(any(C((T)x1, (T)y1, (T)z1, (T)w1) != C((T)w1, (T)z1, (T)y1, (T)x1)))
 13 | // TestOp_xN: "vector op vector" must equal, exactly, the vector built from the per-component scalar "op" results
 14 | #define TestOp_x2(C, T, op) ML_Assert(all((C((T)x1, (T)y1) op C((T)x2, (T)y2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2)))
 15 | #define TestOp_x3(C, T, op) ML_Assert(all((C((T)x1, (T)y1, (T)z1) op C((T)x2, (T)y2, (T)z2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2, (T)z1 op(T) z2)))
 16 | #define TestOp_x4(C, T, op) ML_Assert(all((C((T)x1, (T)y1, (T)z1, (T)w1) op C((T)x2, (T)y2, (T)z2, (T)w2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2, (T)z1 op(T) z2, (T)w1 op(T) w2)))
 17 | // Test1_xN: func applied to a vector must equal, exactly, the vector of func applied to each component
 18 | #define Test1_x2(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1)) == C(func((T)x1), func((T)y1))))
 19 | #define Test1_x3(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1)) == C(func((T)x1), func((T)y1), func((T)z1))))
 20 | #define Test1_x4(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1, (T)w1)) == C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))))
 21 | // Test2_xN: the same exact comparison for a two-argument func
 22 | #define Test2_x2(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1), C((T)x2, (T)y2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2))))
 23 | #define Test2_x3(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))))
 24 | 
 25 | #define Test2_x4(C, T, func) \
 26 |     ML_Assert(all(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))))
 27 | // Test1_xN_eps / Test2_xN_eps: like Test1 / Test2, but each component may deviate by up to |expected| * Test_Eps
 28 | #define Test1_x3_eps(C, T, func) \
 29 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1)) - C(func((T)x1), func((T)y1), func((T)z1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1))) * (T)Test_Eps))
 30 | 
 31 | #define Test1_x4_eps(C, T, func) \
 32 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1)) - C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) * (T)Test_Eps))
 33 | 
 34 | #define Test2_x3_eps(C, T, func) \
 35 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) <= abs(C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) * (T)Test_Eps))
 36 | #define Test2_x4_eps(C, T, func) \
 37 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))) <= abs(C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))) * (T)Test_Eps))
 38 | // *_ceps variants. NOTE(review): Test1_x3_ceps / Test1_x4_ceps have the same body as the _eps versions (relative Test_Eps); only Test2_x3_ceps / Test2_x4_ceps compare against the absolute Test_ConstantEps - confirm this is intended
 39 | #define Test1_x3_ceps(C, T, func) \
 40 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1)) - C(func((T)x1), func((T)y1), func((T)z1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1))) * (T)Test_Eps))
 41 | 
 42 | #define Test1_x4_ceps(C, T, func) \
 43 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1)) - C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) * (T)Test_Eps))
 44 | 
 45 | #define Test2_x3_ceps(C, T, func) \
 46 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) <= (T)Test_ConstantEps))
 47 | #define Test2_x4_ceps(C, T, func) \
 48 |     ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))) <= (T)Test_ConstantEps))
 49 | 
 50 | #include "../ml.hlsli"
 51 | 
 52 | #ifdef ML_NAMESPACE
 53 | namespace ml {
 54 | #endif
 55 | // Self-test: checks that vector operators and math functions agree with the same operation applied per scalar component
 56 | void ML_Tests() {
 57 |     const uint32_t N = 10000;
 58 |     const float R = 10000.0f;
 59 | 
 60 |     uint32_t rngState = 1983;
 61 |     Rng::Hash::Initialize(rngState, 0, 0);
 62 | 
 63 |     for (uint32_t i = 0; i < N; i++) {
 64 |         { // Ops
 65 |             float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 66 |             float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 67 |             float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 68 |             float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 69 | 
 70 |             float x2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 71 |             float y2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 72 |             float z2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 73 |             float w2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
 74 | 
 75 |             TestOp_x2(float2, float, -);
 76 |             TestOp_x2(float2, float, +);
 77 |             TestOp_x2(float2, float, *);
 78 |             TestOp_x2(float2, float, /);
 79 | 
 80 |             TestOp_x3(float3, float, -);
 81 |             TestOp_x3(float3, float, +);
 82 |             TestOp_x3(float3, float, *);
 83 |             TestOp_x3(float3, float, /);
 84 | 
 85 |             TestOp_x4(float4, float, -);
 86 |             TestOp_x4(float4, float, +);
 87 |             TestOp_x4(float4, float, *);
 88 |             TestOp_x4(float4, float, /);
 89 | 
 90 |             TestOp_x2(double2, double, -);
 91 |             TestOp_x2(double2, double, +);
 92 |             TestOp_x2(double2, double, *);
 93 |             TestOp_x2(double2, double, /);
 94 | 
 95 |             TestOp_x3(double3, double, -);
 96 |             TestOp_x3(double3, double, +);
 97 |             TestOp_x3(double3, double, *);
 98 |             TestOp_x3(double3, double, /);
 99 | 
100 |             TestOp_x4(double4, double, +);
101 |             TestOp_x4(double4, double, -);
102 |             TestOp_x4(double4, double, *);
103 |             TestOp_x4(double4, double, /);
104 | 
105 |             // Avoid division by "0" for integers
106 |             if (x2 > -1 && x2 < 1)
107 |                 x2 = 1;
108 | 
109 |             if (y2 > -1 && y2 < 1)
110 |                 y2 = 1;
111 | 
112 |             if (z2 > -1 && z2 < 1)
113 |                 z2 = 1;
114 | 
115 |             if (w2 > -1 && w2 < 1)
116 |                 w2 = 1;
117 | 
118 |             TestOp_x2(int2, int32_t, -);
119 |             TestOp_x2(int2, int32_t, +);
120 |             TestOp_x2(int2, int32_t, *);
121 |             TestOp_x2(int2, int32_t, /);
122 | 
123 |             TestOp_x3(int3, int32_t, -);
124 |             TestOp_x3(int3, int32_t, +);
125 |             TestOp_x3(int3, int32_t, *);
126 |             TestOp_x3(int3, int32_t, /);
127 | 
128 |             TestOp_x4(int4, int32_t, -);
129 |             TestOp_x4(int4, int32_t, +);
130 |             TestOp_x4(int4, int32_t, *);
131 |             TestOp_x4(int4, int32_t, /);
132 |             // NOTE(review): inputs are centered on 0; if TestOp_* casts them with (T) like Test1_*/Test2_* do, a negative float -> uint32_t conversion is undefined behavior - confirm
133 |             TestOp_x2(uint2, uint32_t, -);
134 |             TestOp_x2(uint2, uint32_t, +);
135 |             TestOp_x2(uint2, uint32_t, *);
136 |             TestOp_x2(uint2, uint32_t, /);
137 | 
138 |             TestOp_x3(uint3, uint32_t, -);
139 |             TestOp_x3(uint3, uint32_t, +);
140 |             TestOp_x3(uint3, uint32_t, *);
141 |             TestOp_x3(uint3, uint32_t, /);
142 | 
143 |             TestOp_x4(uint4, uint32_t, -);
144 |             TestOp_x4(uint4, uint32_t, +);
145 |             TestOp_x4(uint4, uint32_t, *);
146 |             TestOp_x4(uint4, uint32_t, /);
147 |         }
148 | 
149 |         { // Integer ops
150 |             uint32_t x1 = Rng::Hash::GetUint(rngState);
151 |             uint32_t y1 = Rng::Hash::GetUint(rngState);
152 |             uint32_t z1 = Rng::Hash::GetUint(rngState);
153 |             uint32_t w1 = Rng::Hash::GetUint(rngState);
154 | 
155 |             uint32_t x2 = Rng::Hash::GetUint(rngState);
156 |             uint32_t y2 = Rng::Hash::GetUint(rngState);
157 |             uint32_t z2 = Rng::Hash::GetUint(rngState);
158 |             uint32_t w2 = Rng::Hash::GetUint(rngState);
159 | 
160 |             TestOp_x2(int2, int32_t, &);
161 |             TestOp_x2(int2, int32_t, |);
162 |             TestOp_x2(int2, int32_t, ^);
163 | 
164 |             TestOp_x3(int3, int32_t, &);
165 |             TestOp_x3(int3, int32_t, |);
166 |             TestOp_x3(int3, int32_t, ^);
167 | 
168 |             TestOp_x4(int4, int32_t, &);
169 |             TestOp_x4(int4, int32_t, |);
170 |             TestOp_x4(int4, int32_t, ^);
171 | 
172 |             TestOp_x2(uint2, uint32_t, &);
173 |             TestOp_x2(uint2, uint32_t, |);
174 |             TestOp_x2(uint2, uint32_t, ^);
175 | 
176 |             TestOp_x3(uint3, uint32_t, &);
177 |             TestOp_x3(uint3, uint32_t, |);
178 |             TestOp_x3(uint3, uint32_t, ^);
179 | 
180 |             TestOp_x4(uint4, uint32_t, &);
181 |             TestOp_x4(uint4, uint32_t, |);
182 |             TestOp_x4(uint4, uint32_t, ^);
183 | 
184 |             // Shifts and mod: use sane 2nd arg (1st arg keeps the sign bit clear, 2nd arg is limited to 0..31)
185 |             x1 &= 0x7FFFFFFF;
186 |             y1 &= 0x7FFFFFFF;
187 |             z1 &= 0x7FFFFFFF;
188 |             w1 &= 0x7FFFFFFF;
189 | 
190 |             x2 &= 31;
191 |             y2 &= 31;
192 |             z2 &= 31;
193 |             w2 &= 31;
194 | 
195 |             TestOp_x2(int2, int32_t, <<);
196 |             TestOp_x2(int2, int32_t, >>);
197 | 
198 |             TestOp_x3(int3, int32_t, <<);
199 |             TestOp_x3(int3, int32_t, >>);
200 | 
201 |             TestOp_x4(int4, int32_t, <<);
202 |             TestOp_x4(int4, int32_t, >>);
203 | 
204 |             TestOp_x2(uint2, uint32_t, <<);
205 |             TestOp_x2(uint2, uint32_t, >>);
206 | 
207 |             TestOp_x3(uint3, uint32_t, <<);
208 |             TestOp_x3(uint3, uint32_t, >>);
209 | 
210 |             TestOp_x4(uint4, uint32_t, <<);
211 |             TestOp_x4(uint4, uint32_t, >>);
212 | 
213 |             // Avoid division by "0"
214 |             if (!x2)
215 |                 x2 = 1;
216 | 
217 |             if (!y2)
218 |                 y2 = 1;
219 | 
220 |             if (!z2)
221 |                 z2 = 1;
222 | 
223 |             if (!w2)
224 |                 w2 = 1;
225 | 
226 |             TestOp_x2(int2, int32_t, %);
227 |             TestOp_x3(int3, int32_t, %);
228 |             TestOp_x4(int4, int32_t, %);
229 |             TestOp_x2(uint2, uint32_t, %);
230 |             TestOp_x3(uint3, uint32_t, %);
231 |             TestOp_x4(uint4, uint32_t, %);
232 |         }
233 | 
234 |         { // Math [-INF, INF]
235 |             float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
236 |             float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
237 |             float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
238 |             float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
239 | 
240 |             float x2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
241 |             float y2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
242 |             float z2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
243 |             float w2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
244 | 
245 |             Test1_x2(float2, float, degrees);
246 |             Test1_x2(float2, float, radians);
247 |             Test1_x2(float2, float, sign);
248 |             Test1_x2(float2, float, abs);
249 |             Test1_x2(float2, float, floor);
250 |             Test1_x2(float2, float, ceil);
251 |             Test1_x2(float2, float, frac);
252 |             Test1_x2(float2, float, saturate);
253 |             Test2_x2(float2, float, min);
254 |             Test2_x2(float2, float, max);
255 |             Test2_x2(float2, float, step);
256 | 
257 |             Test1_x2(float2, float, rcp);
258 |             Test1_x2(float2, float, sin);
259 |             Test1_x2(float2, float, cos);
260 |             Test1_x2(float2, float, tan);
261 |             Test1_x2(float2, float, atan);
262 |             Test2_x2(float2, float, fmod);
263 |             Test2_x2(float2, float, atan2);
264 | 
265 |             Test1_x2(double2, double, degrees);
266 |             Test1_x2(double2, double, radians);
267 |             Test1_x2(double2, double, sign);
268 |             Test1_x2(double2, double, abs);
269 |             Test1_x2(double2, double, floor);
270 |             Test1_x2(double2, double, ceil);
271 |             Test1_x2(double2, double, frac);
272 |             Test1_x2(double2, double, saturate);
273 |             Test2_x2(double2, double, min);
274 |             Test2_x2(double2, double, max);
275 |             Test2_x2(double2, double, step);
276 | 
277 |             Test1_x2(double2, double, rcp);
278 |             Test1_x2(double2, double, sin);
279 |             Test1_x2(double2, double, cos);
280 |             Test1_x2(double2, double, tan);
281 |             Test1_x2(double2, double, atan);
282 |             Test2_x2(double2, double, fmod);
283 |             Test2_x2(double2, double, atan2);
284 | 
285 |             Test1_x3(float3, float, degrees);
286 |             Test1_x3(float3, float, radians);
287 |             Test1_x3(float3, float, sign);
288 |             Test1_x3(float3, float, abs);
289 |             Test1_x3(float3, float, floor);
290 |             Test1_x3(float3, float, ceil);
291 |             Test1_x3(float3, float, frac);
292 |             Test1_x3(float3, float, saturate);
293 |             Test2_x3(float3, float, min);
294 |             Test2_x3(float3, float, max);
295 |             Test2_x3(float3, float, step);
296 | 
297 |             Test1_x3_eps(float3, float, rcp);
298 |             Test1_x3_eps(float3, float, sin);
299 |             Test1_x3_eps(float3, float, cos);
300 |             Test1_x3_eps(float3, float, tan);
301 |             Test1_x3_eps(float3, float, atan);
302 |             Test2_x3_ceps(float3, float, fmod);
303 |             Test2_x3_eps(float3, float, atan2);
304 | 
305 |             Test1_x3(double3, double, degrees);
306 |             Test1_x3(double3, double, radians);
307 |             Test1_x3(double3, double, sign);
308 |             Test1_x3(double3, double, abs);
309 |             Test1_x3(double3, double, floor);
310 |             Test1_x3(double3, double, ceil);
311 |             Test1_x3(double3, double, frac);
312 |             Test1_x3(double3, double, saturate);
313 |             Test2_x3(double3, double, min);
314 |             Test2_x3(double3, double, max);
315 |             Test2_x3(double3, double, step);
316 | 
317 |             Test1_x3_eps(double3, double, rcp);
318 |             Test1_x3_eps(double3, double, sin);
319 |             Test1_x3_eps(double3, double, cos);
320 |             Test1_x3_eps(double3, double, tan);
321 |             Test1_x3_eps(double3, double, atan);
322 |             Test2_x3_ceps(double3, double, fmod);
323 |             Test2_x3_eps(double3, double, atan2);
324 | 
325 |             Test1_x4(float4, float, degrees);
326 |             Test1_x4(float4, float, radians);
327 |             Test1_x4(float4, float, sign);
328 |             Test1_x4(float4, float, abs);
329 |             Test1_x4(float4, float, floor);
330 |             Test1_x4(float4, float, ceil);
331 |             Test1_x4(float4, float, frac);
332 |             Test1_x4(float4, float, saturate);
333 |             Test2_x4(float4, float, min);
334 |             Test2_x4(float4, float, max);
335 |             Test2_x4(float4, float, step);
336 | 
337 |             Test1_x4_eps(float4, float, rcp);
338 |             Test1_x4_eps(float4, float, sin);
339 |             Test1_x4_eps(float4, float, cos);
340 |             Test1_x4_eps(float4, float, tan);
341 |             Test1_x4_eps(float4, float, atan);
342 |             Test2_x4_ceps(float4, float, fmod);
343 |             Test2_x4_eps(float4, float, atan2);
344 | 
345 |             Test1_x4(double4, double, degrees);
346 |             Test1_x4(double4, double, radians);
347 |             Test1_x4(double4, double, sign);
348 |             Test1_x4(double4, double, abs);
349 |             Test1_x4(double4, double, floor);
350 |             Test1_x4(double4, double, ceil);
351 |             Test1_x4(double4, double, frac);
352 |             Test1_x4(double4, double, saturate);
353 |             Test2_x4(double4, double, min);
354 |             Test2_x4(double4, double, max);
355 |             Test2_x4(double4, double, step);
356 | 
357 |             Test1_x4_eps(double4, double, rcp);
358 |             Test1_x4_eps(double4, double, sin);
359 |             Test1_x4_eps(double4, double, cos);
360 |             Test1_x4_eps(double4, double, tan);
361 |             Test1_x4_eps(double4, double, atan);
362 |             Test2_x4_ceps(double4, double, fmod);
363 |             Test2_x4_eps(double4, double, atan2);
364 | 
365 |             // round: avoid fractional part = 0.5 (the bit pattern is bumped by one to move off the exact tie)
366 |             if (frac(x1) == 0.5f)
367 |                 x1 = uFloat(uFloat(x1).i + 1).f;
368 | 
369 |             if (frac(y1) == 0.5f)
370 |                 y1 = uFloat(uFloat(y1).i + 1).f;
371 | 
372 |             if (frac(z1) == 0.5f)
373 |                 z1 = uFloat(uFloat(z1).i + 1).f;
374 | 
375 |             if (frac(w1) == 0.5f)
376 |                 w1 = uFloat(uFloat(w1).i + 1).f;
377 | 
378 |             Test1_x2(float2, float, round);
379 |             Test1_x3(float3, float, round);
380 |             Test1_x4(float4, float, round);
381 | 
382 |             Test1_x2(double2, double, round);
383 |             Test1_x3(double3, double, round);
384 |             Test1_x4(double4, double, round);
385 | 
386 |             // pow/exp/exp2: do not use too large "x" and "y" to avoid "INF"
387 |             x1 *= 32.0f / R;
388 |             y1 *= 32.0f / R;
389 |             z1 *= 32.0f / R;
390 |             w1 *= 32.0f / R;
391 | 
392 |             x2 *= 32.0f / R;
393 |             y2 *= 32.0f / R;
394 |             z2 *= 32.0f / R;
395 |             w2 *= 32.0f / R;
396 | 
397 |             Test1_x2(float2, float, exp);
398 |             Test1_x3_eps(float3, float, exp);
399 |             Test1_x4_eps(float4, float, exp);
400 | 
401 |             Test1_x2(double2, double, exp);
402 |             Test1_x3_eps(double3, double, exp);
403 |             Test1_x4_eps(double4, double, exp);
404 | 
405 |             Test1_x2(float2, float, exp2);
406 |             Test1_x3_eps(float3, float, exp2);
407 |             Test1_x4_eps(float4, float, exp2);
408 | 
409 |             Test1_x2(double2, double, exp2);
410 |             Test1_x3_eps(double3, double, exp2);
411 |             Test1_x4_eps(double4, double, exp2);
412 | 
413 |             // pow: "x" must be positive
414 |             x1 = abs(x1);
415 |             y1 = abs(y1);
416 |             z1 = abs(z1);
417 |             w1 = abs(w1);
418 | 
419 |             Test2_x2(float2, float, pow);
420 |             Test2_x3_eps(float3, float, pow);
421 |             Test2_x4_eps(float4, float, pow);
422 | 
423 |             Test2_x2(double2, double, pow);
424 |             Test2_x3_eps(double3, double, pow);
425 |             Test2_x4_eps(double4, double, pow);
426 |         }
427 | 
428 |         { // Math (> 0)
429 |             float x1 = Rng::Hash::GetFloat(rngState) * R;
430 |             float y1 = Rng::Hash::GetFloat(rngState) * R;
431 |             float z1 = Rng::Hash::GetFloat(rngState) * R;
432 |             float w1 = Rng::Hash::GetFloat(rngState) * R;
433 | 
434 |             Test1_x2(float2, float, rsqrt);
435 |             Test1_x2(float2, float, sqrt);
436 |             Test1_x2(float2, float, log);
437 |             Test1_x2(float2, float, log2);
438 | 
439 |             Test1_x2(double2, double, rsqrt);
440 |             Test1_x2(double2, double, sqrt);
441 |             Test1_x2(double2, double, log);
442 |             Test1_x2(double2, double, log2);
443 | 
444 |             Test1_x3_eps(float3, float, rsqrt);
445 |             Test1_x3_eps(float3, float, sqrt);
446 |             Test1_x3_eps(float3, float, log);
447 |             Test1_x3_eps(float3, float, log2);
448 | 
449 |             Test1_x3_eps(double3, double, rsqrt);
450 |             Test1_x3_eps(double3, double, sqrt);
451 |             Test1_x3_eps(double3, double, log);
452 |             Test1_x3_eps(double3, double, log2);
453 | 
454 |             Test1_x4_eps(float4, float, rsqrt);
455 |             Test1_x4_eps(float4, float, sqrt);
456 |             Test1_x4_eps(float4, float, log);
457 |             Test1_x4_eps(float4, float, log2);
458 | 
459 |             Test1_x4_eps(double4, double, rsqrt);
460 |             Test1_x4_eps(double4, double, sqrt);
461 |             Test1_x4_eps(double4, double, log);
462 |             Test1_x4_eps(double4, double, log2);
463 |         }
464 | 
465 |         { // Math [-1; 1]
466 |             float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f;
467 |             float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f;
468 |             float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f;
469 |             float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f;
470 | 
471 |             Test1_x2(float2, float, asin);
472 |             Test1_x2(float2, float, acos);
473 | 
474 |             Test1_x2(double2, double, asin);
475 |             Test1_x2(double2, double, acos);
476 | 
477 |             Test1_x3_eps(float3, float, asin);
478 |             Test1_x3_eps(float3, float, acos);
479 | 
480 |             Test1_x3_eps(double3, double, asin);
481 |             Test1_x3_eps(double3, double, acos);
482 | 
483 |             Test1_x4_eps(float4, float, asin);
484 |             Test1_x4_eps(float4, float, acos);
485 | 
486 |             Test1_x4_eps(double4, double, asin);
487 |             Test1_x4_eps(double4, double, acos);
488 |         }
489 |     }
490 | 
491 |     { // == and !=
492 |         float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
493 |         float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
494 |         float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
495 |         float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R;
496 | 
497 |         TestEqual_x2(int2, int32_t);
498 |         TestEqual_x3(int3, int32_t);
499 |         TestEqual_x4(int4, int32_t);
500 | 
501 |         TestEqual_x2(uint2, uint32_t);
502 |         TestEqual_x3(uint3, uint32_t);
503 |         TestEqual_x4(uint4, uint32_t);
504 | 
505 |         TestEqual_x2(float2, float);
506 |         TestEqual_x3(float3, float);
507 |         TestEqual_x4(float4, float);
508 | 
509 |         TestEqual_x2(double2, double);
510 |         TestEqual_x3(double3, double);
511 |         TestEqual_x4(double4, double);
512 | 
513 |         TestNotEqual_x2(int2, int32_t);
514 |         TestNotEqual_x3(int3, int32_t);
515 |         TestNotEqual_x4(int4, int32_t);
516 | 
517 |         TestNotEqual_x2(uint2, uint32_t);
518 |         TestNotEqual_x3(uint3, uint32_t);
519 |         TestNotEqual_x4(uint4, uint32_t);
520 | 
521 |         TestNotEqual_x2(float2, float);
522 |         TestNotEqual_x3(float3, float);
523 |         TestNotEqual_x4(float4, float);
524 | 
525 |         TestNotEqual_x2(double2, double);
526 |         TestNotEqual_x3(double3, double);
527 |         TestNotEqual_x4(double4, double);
528 |     }
529 | 
530 |     { // +0 == -0
531 |         ML_Assert(all(+0 == -0));
532 |         ML_Assert(all(int2(+0) == int2(-0)));
533 |         ML_Assert(all(int3(+0) == int3(-0)));
534 |         ML_Assert(all(int4(+0) == int4(-0)));
535 | 
536 |         ML_Assert(all(+0.0f == -0.0f));
537 |         ML_Assert(all(float2(+0.0f) == float2(-0.0f)));
538 |         ML_Assert(all(float3(+0.0f) == float3(-0.0f)));
539 |         ML_Assert(all(float4(+0.0f) == float4(-0.0f)));
540 | 
541 |         ML_Assert(all(+0.0 == -0.0));
542 |         ML_Assert(all(double2(+0.0) == double2(-0.0)));
543 |         ML_Assert(all(double3(+0.0) == double3(-0.0)));
544 |         ML_Assert(all(double4(+0.0) == double4(-0.0)));
545 |     }
546 | 
547 |     { // NAN != NAN
548 |         float myNanf = log(0.0f) * 0.0f;
549 |         ML_Assert(myNanf != myNanf);
550 |         ML_Assert(any(float2(myNanf) != float2(myNanf)));
551 |         ML_Assert(any(float3(myNanf) != float3(myNanf)));
552 |         ML_Assert(any(float4(myNanf) != float4(myNanf)));
553 | 
554 |         double myNan = log(0.0) * 0.0;
555 |         ML_Assert(myNan != myNan);
556 |         ML_Assert(any(double2(myNan) != double2(myNan)));
557 |         ML_Assert(any(double3(myNan) != double3(myNan)));
558 |         ML_Assert(any(double4(myNan) != double4(myNan)));
559 |     }
560 | }
561 | 
562 | #ifdef ML_NAMESPACE
563 | }
564 | #endif
565 | 


--------------------------------------------------------------------------------
/Guts/u32.h:
--------------------------------------------------------------------------------
  1 | // © 2021 NVIDIA Corporation
  2 | 
  3 | #pragma once
  4 | 
  5 | //======================================================================================================================
  6 | // uint2: two uint32_t components; the union overlays the packed "mm" (v2i), the array "a" and the named fields "x", "y"
  7 | //======================================================================================================================
  8 | 
  9 | union uint2 {
 10 |     v2i mm;
 11 | 
 12 |     struct {
 13 |         uint32_t a[COORD_2D];
 14 |     };
 15 | 
 16 |     struct {
 17 |         uint32_t x, y;
 18 |     };
 19 | 
 20 |     ML_SWIZZLE_2(uint2, uint32_t);
 21 | 
 22 | public:
 23 |     ML_INLINE uint2()  // default: "mm" is initialized with 0
 24 |         : mm(0) {
 25 |     }
 26 | 
 27 |     ML_INLINE uint2(uint32_t c)  // broadcast: both components are set to "c"
 28 |         : x(c), y(c) {
 29 |     }
 30 | 
 31 |     ML_INLINE uint2(uint32_t _x, uint32_t _y)  // per-component construction
 32 |         : x(_x), y(_y) {
 33 |     }
 34 | 
 35 |     ML_INLINE uint2(const uint2& v) = default;
 36 | 
 37 |     // Set (copies the packed "mm" member; returns void, so assignments cannot be chained)
 38 | 
 39 |     ML_INLINE void operator=(const uint2& v) {
 40 |         mm = v.mm;
 41 |     }
 42 | 
 43 |     // Conversion (declared here only; the definitions are not in this part of the file)
 44 | 
 45 |     ML_INLINE operator int2() const;
 46 |     ML_INLINE operator float2() const;
 47 |     ML_INLINE operator double2() const;
 48 | 
 49 |     // Compare (operators are generated by ML_COMPARE_UNOPT; bool2 is passed as its first argument, presumably the result type)
 50 | 
 51 |     ML_COMPARE_UNOPT(bool2, uint2, <)
 52 |     ML_COMPARE_UNOPT(bool2, uint2, <=)
 53 |     ML_COMPARE_UNOPT(bool2, uint2, ==)
 54 |     ML_COMPARE_UNOPT(bool2, uint2, >=)
 55 |     ML_COMPARE_UNOPT(bool2, uint2, >)
 56 |     ML_COMPARE_UNOPT(bool2, uint2, !=)
 57 | 
 58 |     // Ops (operators are generated by ML_OP_UNOPT; each line names a binary operator and its compound-assignment form)
 59 | 
 60 |     ML_OP_UNOPT(uint2, uint32_t, -, -=)
 61 |     ML_OP_UNOPT(uint2, uint32_t, +, +=)
 62 |     ML_OP_UNOPT(uint2, uint32_t, *, *=)
 63 |     ML_OP_UNOPT(uint2, uint32_t, /, /=)
 64 |     ML_OP_UNOPT(uint2, uint32_t, %, %=)
 65 |     ML_OP_UNOPT(uint2, uint32_t, <<, <<=)
 66 |     ML_OP_UNOPT(uint2, uint32_t, >>, >>=)
 67 |     ML_OP_UNOPT(uint2, uint32_t, &, &=)
 68 |     ML_OP_UNOPT(uint2, uint32_t, |, |=)
 69 |     ML_OP_UNOPT(uint2, uint32_t, ^, ^=)
 70 | };
 71 | ML_INLINE uint2 min(const uint2& x, const uint2& y) {  // component-wise minimum of two uint2 values
 72 |     const uint32_t minX = min(x.x, y.x), minY = min(x.y, y.y);
 73 |     return uint2(minX, minY);
 74 | }
 75 | ML_INLINE uint2 max(const uint2& x, const uint2& y) {  // component-wise maximum of two uint2 values
 76 |     const uint32_t maxX = max(x.x, y.x), maxY = max(x.y, y.y);
 77 |     return uint2(maxX, maxY);
 78 | }
 79 | 
 80 | //======================================================================================================================
 81 | // uint3: three uint32_t components held in a v4i ("xmm"); the array "a" and the fields "x", "y", "z" overlay its first 12 bytes
 82 | //======================================================================================================================
 83 | 
 84 | union uint3 {
 85 |     v4i xmm;
 86 | 
 87 |     struct {
 88 |         uint32_t a[COORD_3D];
 89 |     };
 90 | 
 91 |     struct {
 92 |         uint32_t x, y, z;
 93 |     };
 94 | 
 95 |     ML_SWIZZLE_3(v4u_swizzle2, uint2, v4u_swizzle3, uint3);
 96 | 
 97 | public:
 98 |     ML_INLINE uint3()  // default: all lanes zero
 99 |         : xmm(_mm_setzero_si128()) {
100 |     }
101 | 
102 |     ML_INLINE uint3(uint32_t c)  // broadcast: "c" is written to every lane of "xmm"
103 |         : xmm(_mm_set1_epi32(c)) {
104 |     }
105 | 
106 |     ML_INLINE uint3(uint32_t _x, uint32_t _y, uint32_t _z)  // 4th v4i_set argument is the constant 1 (presumably keeps the unused lane non-zero for lane-wise division - TODO confirm)
107 |         : xmm(v4i_set(_x, _y, _z, 1)) {
108 |     }
109 | 
110 |     ML_INLINE uint3(const uint2& v, uint32_t _z)  // (v.x, v.y, _z)
111 |         : xmm(v4i_set(v.x, v.y, _z, 1)) {
112 |     }
113 | 
114 |     ML_INLINE uint3(uint32_t _x, const uint2& v)  // (_x, v.x, v.y)
115 |         : xmm(v4i_set(_x, v.x, v.y, 1)) {
116 |     }
117 | 
118 |     ML_INLINE uint3(const v4i& v)  // wraps a raw v4i as is
119 |         : xmm(v) {
120 |     }
121 | 
122 |     ML_INLINE uint3(const uint32_t* v3)  // reads v3[0], v3[1] and v3[2]
123 |         : xmm(v4i_set(v3[0], v3[1], v3[2], 1)) {
124 |     }
125 | 
126 |     ML_INLINE uint3(const uint3& v) = default;
127 | 
128 |     // Set (copies the whole "xmm" member; returns void, so assignments cannot be chained)
129 | 
130 |     ML_INLINE void operator=(const uint3& v) {
131 |         xmm = v.xmm;
132 |     }
133 | 
134 |     // Conversion (declared here only; the definitions are not in this part of the file)
135 | 
136 |     ML_INLINE operator int3() const;
137 |     ML_INLINE operator float3() const;
138 |     ML_INLINE operator double3() const;
139 | 
140 |     // Compare. NOTE(review): <, <=, >, >= are given signed-named *_epi32 intrinsics; if lanes are compared as signed, values >= 0x80000000 are ordered incorrectly for an unsigned type - confirm
141 | 
142 |     ML_COMPARE(bool3, uint3, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm)
143 |     ML_COMPARE(bool3, uint3, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm)
144 |     ML_COMPARE(bool3, uint3, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm)
145 |     ML_COMPARE(bool3, uint3, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm)
146 |     ML_COMPARE(bool3, uint3, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm)
147 |     ML_COMPARE(bool3, uint3, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm)
148 | 
149 |     // Ops (generated by ML_OP: operator, compound-assignment form, vector function, scalar broadcast function, register member)
150 | 
151 |     ML_OP(uint3, uint32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm)
152 |     ML_OP(uint3, uint32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm)
153 |     ML_OP(uint3, uint32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm)
154 |     ML_OP(uint3, uint32_t, /, /=, _mm_div_epu32, _mm_set1_epi32, xmm)
155 |     ML_OP(uint3, uint32_t, %, %=, v4u_mod, _mm_set1_epi32, xmm)
156 |     ML_OP(uint3, uint32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm)
157 |     ML_OP(uint3, uint32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm)
158 |     ML_OP(uint3, uint32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm)
159 |     ML_OP(uint3, uint32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm)
160 |     ML_OP(uint3, uint32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm)
161 | 
162 |     // Misc (implicit access to the underlying v4i and an all-zero factory)
163 | 
164 |     ML_INLINE operator v4i() const {
165 |         return xmm;
166 |     }
167 | 
168 |     static ML_INLINE uint3 Zero() {
169 |         return _mm_setzero_si128();
170 |     }
171 | };
172 | ML_INLINE uint3 min(const uint3& x, const uint3& y) {  // per-lane unsigned minimum via _mm_min_epu32
173 |     const v4i lowest = _mm_min_epu32(x.xmm, y.xmm);
174 |     return uint3(lowest);
175 | }
176 | ML_INLINE uint3 max(const uint3& x, const uint3& y) {  // per-lane unsigned maximum via _mm_max_epu32
177 |     const v4i highest = _mm_max_epu32(x.xmm, y.xmm);
178 |     return uint3(highest);
179 | }
180 | 
181 | //======================================================================================================================
182 | // uint4: four uint32_t components held in a v4i ("xmm"); the array "a" and the fields "x", "y", "z", "w" overlay the same storage
183 | //======================================================================================================================
184 | 
185 | union uint4 {
186 |     v4i xmm;
187 | 
188 |     struct {
189 |         uint32_t a[COORD_4D];
190 |     };
191 | 
192 |     struct {
193 |         uint32_t x, y, z, w;
194 |     };
195 | 
196 |     ML_SWIZZLE_4(v4u_swizzle2, uint2, v4u_swizzle3, uint3, v4u_swizzle4, uint4);
197 | 
198 | public:
199 |     ML_INLINE uint4()  // default: all lanes zero
200 |         : xmm(_mm_setzero_si128()) {
201 |     }
202 | 
203 |     ML_INLINE uint4(uint32_t c)  // broadcast: "c" is written to every lane of "xmm"
204 |         : xmm(_mm_set1_epi32(c)) {
205 |     }
206 | 
207 |     ML_INLINE uint4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)  // per-component construction
208 |         : xmm(v4i_set(_x, _y, _z, _w)) {
209 |     }
210 | 
211 |     ML_INLINE uint4(const uint3& v, uint32_t _w)  // (v.x, v.y, v.z, _w)
212 |         : xmm(v4i_set(v.x, v.y, v.z, _w)) {
213 |     }
214 | 
215 |     ML_INLINE uint4(const uint2& a, const uint2& b)  // (a.x, a.y, b.x, b.y)
216 |         : xmm(v4i_set(a.x, a.y, b.x, b.y)) {
217 |     }
218 | 
219 |     ML_INLINE uint4(uint32_t _x, const uint3& v)  // (_x, v.x, v.y, v.z)
220 |         : xmm(v4i_set(_x, v.x, v.y, v.z)) {
221 |     }
222 | 
223 |     ML_INLINE uint4(const v4i& v)  // wraps a raw v4i as is
224 |         : xmm(v) {
225 |     }
226 | 
227 |     ML_INLINE uint4(const uint4& v) = default;
228 | 
229 |     // Set (copies the whole "xmm" member; returns void, so assignments cannot be chained)
230 | 
231 |     ML_INLINE void operator=(const uint4& v) {
232 |         xmm = v.xmm;
233 |     }
234 | 
235 |     // Conversion (declared here only; the definitions are not in this part of the file)
236 | 
237 |     ML_INLINE operator int4() const;
238 |     ML_INLINE operator float4() const;
239 |     ML_INLINE operator double4() const;
240 | 
241 |     // Compare. NOTE(review): <, <=, >, >= are given signed-named *_epi32 intrinsics; if lanes are compared as signed, values >= 0x80000000 are ordered incorrectly for an unsigned type - confirm
242 | 
243 |     ML_COMPARE(bool4, uint4, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm)
244 |     ML_COMPARE(bool4, uint4, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm)
245 |     ML_COMPARE(bool4, uint4, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm)
246 |     ML_COMPARE(bool4, uint4, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm)
247 |     ML_COMPARE(bool4, uint4, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm)
248 |     ML_COMPARE(bool4, uint4, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm)
249 | 
250 |     // Ops (generated by ML_OP: operator, compound-assignment form, vector function, scalar broadcast function, register member)
251 | 
252 |     ML_OP(uint4, uint32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm)
253 |     ML_OP(uint4, uint32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm)
254 |     ML_OP(uint4, uint32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm)
255 |     ML_OP(uint4, uint32_t, /, /=, _mm_div_epu32, _mm_set1_epi32, xmm)
256 |     ML_OP(uint4, uint32_t, %, %=, v4u_mod, _mm_set1_epi32, xmm)
257 |     ML_OP(uint4, uint32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm)
258 |     ML_OP(uint4, uint32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm)
259 |     ML_OP(uint4, uint32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm)
260 |     ML_OP(uint4, uint32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm)
261 |     ML_OP(uint4, uint32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm)
262 | 
263 |     // Misc (implicit access to the underlying v4i and an all-zero factory)
264 | 
265 |     ML_INLINE operator v4i() const {
266 |         return xmm;
267 |     }
268 | 
269 |     static ML_INLINE uint4 Zero() {
270 |         return _mm_setzero_si128();
271 |     }
272 | };
273 | ML_INLINE uint4 min(const uint4& x, const uint4& y) {  // per-lane unsigned minimum via _mm_min_epu32
274 |     const v4i lowest = _mm_min_epu32(x.xmm, y.xmm);
275 |     return uint4(lowest);
276 | }
277 | ML_INLINE uint4 max(const uint4& x, const uint4& y) {  // per-lane unsigned maximum via _mm_max_epu32
278 |     const v4i highest = _mm_max_epu32(x.xmm, y.xmm);
279 |     return uint4(highest);
280 | }
281 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a
 4 | copy of this software and associated documentation files (the "Software"),
 5 | to deal in the Software without restriction, including without limitation
 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
 7 | and/or sell copies of the Software, and to permit persons to whom the
 8 | Software is furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
16 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
19 | DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MathLib (ML)
 2 | 
 3 | *ML* is a cross-platform header-only *SSE/AVX/NEON*-accelerated math library, designed for computer graphics. It serves two goals:
 4 | - accelerate performance using *SSE/AVX/NEON* intrinsics
 5 | - be HLSL compatible and deliver functionality to both CPU and shader code without code duplication
 6 | 
 7 | Features:
 8 | - compile-time optimization level specialization: SSE3 (and below), +SSE4, +AVX1, +AVX2 (or NEON on ARM via [*sse2neon*](https://github.com/DLTcollab/sse2neon))
 9 | - `int2`, `int3` and `int4` types
10 | - `uint2`, `uint3` and `uint4` types
11 | - `float2`, `float3`, `float4` and `float4x4` types
12 | - `double2`, `double3`, `double4` and `double4x4` types
13 | - `bool2`, `bool3` and `bool4` types
14 | - overloaded operators
15 | - vector swizzling
16 | - common functions: `all`, `any`, `sign`, `abs`, `floor`, `round`, `ceil`, `fmod`, `frac`, `min`, `max`, `clamp`, `saturate`, `lerp`, `step`, `smoothstep` and `linearstep`
17 | - transcendental functions: `sin`, `cos`, `tan`, `asin`, `acos`, `atan`, `atan2`, `sqrt`, `rsqrt`, `rcp`, `pow`, `log`, `log2`, `exp` and `exp2`
18 | - data conversion and packing functionality - FP32, FP16, SNORM and UNORM (with any number of bits per component)
19 | - vectors and matrices
20 | - miscellaneous linear algebra functionality
21 | - miscellaneous projective math functionality
22 | - frustum & AABB primitives
23 | - random number generation
24 | - sorting
25 | 
26 | Important:
27 | - `sizeof(int3/uint3/float3) == sizeof(float4)` on CPU
28 | - `sizeof(double3) == sizeof(double4)` on CPU
29 | - `using namespace std` can lead to name collisions
30 | - inclusion of `cmath` and/or `cstdlib` (even implicitly) after `ml.h` leads to name collisions
31 | 
32 | The repository also includes `ml.hlsli`, a standalone HLSL math library that can be used from C++ code as well.
33 | 
34 | ## License
35 | 
36 | *ML* is licensed under the MIT License.
37 | 


--------------------------------------------------------------------------------
/ml.h:
--------------------------------------------------------------------------------
   1 | // © 2021 NVIDIA Corporation
   2 | 
   3 | /*
   4 | IMPORTANT:
   5 | - intrinsic related headers must not be included *AFTER* ML inclusion
   6 | - "ML_NAMESPACE" macro can be defined to wrap the entire ML into "ml" namespace
   7 | - sizeof(3-component vector) == sizeof(4-component vector) because of SSE
   8 | */
   9 | 
  10 | #pragma once
  11 | 
  12 | #define ML_VERSION      9
  13 | #define ML_VERSION_DATE "2 October 2025"
  14 | 
  15 | //======================================================================================================================
  16 | // Constants
  17 | //======================================================================================================================
  18 | 
  19 | // Intrinsic levels (everything above "ML_INTRINSIC_LEVEL" is emulated)
  20 | #define ML_INTRINSIC_SSE3 0 // +SSE1, +SSE2, +SSE3, +SSSE3 ("-mssse3" in GCC/Clang)
  21 | #define ML_INTRINSIC_SSE4 1 // +SSE4.1, +SSE4.2 ("-msse4.2" in GCC/Clang)
  22 | #define ML_INTRINSIC_AVX1 2 // +AVX1, +FP16C ("-mf16c" in GCC/Clang)
  23 | #define ML_INTRINSIC_AVX2 3 // +AVX2, +FMA3, +bit shift, +swizzle ("-mavx2 -mfma" in GCC/Clang)
  24 | 
  25 | //======================================================================================================================
  26 | // Settings
  27 | //======================================================================================================================
  28 | 
  29 | // Can be set to wrap the library into "ml" namespace
  30 | #ifndef ML_NAMESPACE
  31 | // #define ML_NAMESPACE
  32 | #endif
  33 | 
  34 | // Selected intrinsic level (try to guess)
  35 | #ifndef ML_INTRINSIC_LEVEL
  36 | #    if (defined(__AVX2__) && defined(__FMA__))
  37 | #        define ML_INTRINSIC_LEVEL ML_INTRINSIC_AVX2
  38 | #    elif defined(__F16C__)
  39 | #        define ML_INTRINSIC_LEVEL ML_INTRINSIC_AVX1
  40 | #    elif defined(__SSE4_2__)
  41 | #        define ML_INTRINSIC_LEVEL ML_INTRINSIC_SSE4
  42 | #    else
  43 | #        define ML_INTRINSIC_LEVEL ML_INTRINSIC_SSE3
  44 | #    endif
  45 | #endif
  46 | 
  47 | // ARM?
  48 | #if (defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM))
  49 | #    define ML_ARM
  50 | #endif
  51 | 
  52 | // SVML availability
  53 | #ifndef ML_SVML_AVAILABLE
  54 | #    ifdef ML_ARM
  55 | #        define ML_SVML_AVAILABLE 0
  56 | #    else
  57 | #        define ML_SVML_AVAILABLE (_MSC_VER >= 1920 && __clang__ == 0)
  58 | #    endif
  59 | #endif
  60 | 
  61 | // More precision (a little bit slower)
  62 | #ifndef ML_NEWTONRAPHSON_APROXIMATION
  63 | #    define ML_NEWTONRAPHSON_APROXIMATION 1
  64 | #endif
  65 | 
  66 | // Only for debugging (useful to debug issues in horizontal operations)
  67 | #ifndef ML_CHECK_W_IS_ZERO
  68 | #    define ML_CHECK_W_IS_ZERO 0
  69 | #endif
  70 | 
  71 | // Only for debugging (generate exceptions in rounding operations, only for SSE4)
  72 | #ifndef ML_EXEPTIONS
  73 | #    define ML_EXEPTIONS 0
  74 | #endif
  75 | 
  76 | // Reversed depth
  77 | #ifndef ML_DEPTH_REVERSED
  78 | #    define ML_DEPTH_REVERSED 1
  79 | #endif
  80 | 
  81 | // Can be handy for classic OpenGL
  82 | #ifndef ML_OGL
  83 | #    define ML_OGL 0
  84 | #endif
  85 | 
  86 | // Depth range (either bound can be overridden by defining it before this header is included)
  87 | #ifndef ML_DEPTH_RANGE_NEAR
  88 | #    define ML_DEPTH_RANGE_NEAR 0.0f
  89 | #endif
  90 | 
  91 | #ifndef ML_DEPTH_RANGE_FAR // the guard must test the macro defined below, otherwise a user override gets redefined
  92 | #    define ML_DEPTH_RANGE_FAR 1.0f
  93 | #endif
  94 | 
  95 | // Inline preference
  96 | #ifndef ML_INLINE
  97 | #    if (defined(__GNUC__) || defined(__clang__))
  98 | #        define ML_INLINE __attribute__((always_inline)) inline
  99 | #    else
 100 | #        define ML_INLINE __forceinline
 101 | #    endif
 102 | #endif
 103 | 
 104 | //======================================================================================================================
 105 | // Macro stuff
 106 | //======================================================================================================================
 107 | 
 108 | // Compiler and environment
 109 | 
 110 | #if defined(__GNUC__)
 111 | #    pragma GCC diagnostic push
 112 | #    pragma GCC diagnostic ignored "-Wstrict-aliasing"
 113 | 
 114 | #    define ML_ALIGN(alignment, x) x __attribute__((aligned(alignment)))
 115 | #elif defined(__clang__)
 116 | #    pragma clang diagnostic push
 117 | #    pragma clang diagnostic ignored "-Wstrict-aliasing"
 118 | 
 119 | #    define ML_ALIGN(alignment, x) x __attribute__((aligned(alignment)))
 120 | #else
 121 | #    pragma warning(push)
 122 | #    pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
 123 | 
 124 | #    define ML_ALIGN(alignment, x) __declspec(align(alignment)) x
 125 | #endif
 126 | 
 127 | // Headers
 128 | 
 129 | #include <cmath>   // overloaded floor, round, ceil, fmod, sin, cos, tan, asin, acos, atan, atan2, sqrt, pow, log, log2, exp, exp2
 130 | #include <cstdlib> // overloaded abs
 131 | 
 132 | #include <stdint.h>
 133 | 
 134 | #ifndef _WIN32
 135 | #    include <unistd.h> // TODO: needed?
 136 | #endif
 137 | 
 138 | #if (defined(__i386__) || defined(__x86_64__) || defined(__SCE__))
 139 | #    include <x86intrin.h>
 140 | #elif (defined(ML_ARM))
 141 | #    include "sse2neon.h"
 142 | #else
 143 | #    include <mmintrin.h>
 144 | #    if (ML_SVML_AVAILABLE || ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1)
 145 | #        include <immintrin.h> // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA, SVML
 146 | #    elif (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4)
 147 | #        include <nmmintrin.h> // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2
 148 | #    else
 149 | #        include <tmmintrin.h> // SSE, SSE2, SSE3, SSSE3
 150 | #    endif
 151 | #endif
 152 | 
 153 | // Misc
 154 | #define ML_Unused(...) \
 155 |     do { \
 156 |         (void)sizeof(__VA_ARGS__); \
 157 |     } while (0)
 158 | #define ML_Stringify_(token) #token
 159 | #define ML_Stringify(token)  ML_Stringify_(token)
 160 | 
 161 | #if ML_EXEPTIONS
 162 | #    define ML_ROUNDING_EXEPTIONS_MASK _MM_FROUND_RAISE_EXC
 163 | #else
 164 | #    define ML_ROUNDING_EXEPTIONS_MASK _MM_FROUND_NO_EXC
 165 | #endif
 166 | 
 167 | // Debugging
 168 | 
 169 | #define ML_StaticAssertMsg(x, msg) static_assert(x, msg)
 170 | 
 171 | #ifdef _DEBUG
 172 | #    include <assert.h> // assert
 173 | 
 174 | #    define ML_Assert(x)         assert(x)
 175 | #    define ML_AssertMsg(x, msg) assert(msg&& x)
 176 | #else
 177 | #    define ML_Assert(x)         ((void)0)
 178 | #    define ML_AssertMsg(x, msg) ((void)0)
 179 | #endif
 180 | 
 181 | // Normalized device coordinates
 182 | 
 183 | #if ML_OGL // Depth range [-1; 1], origin "lower left"
 184 | #    define ML_NDC_NEAR_NO_REVERSE -1.0f
 185 | #    define ML_DEPTH_C0            (0.5f * (ML_DEPTH_RANGE_FAR - ML_DEPTH_RANGE_NEAR))
 186 | #    define ML_DEPTH_C1            (0.5f * (ML_DEPTH_RANGE_FAR + ML_DEPTH_RANGE_NEAR))
 187 | 
 188 | template <class T>
 189 | ML_INLINE T ML_ModifyProjZ(bool isReversed, T c2, T c3) { // Variant compiled when ML_OGL is non-zero; `c3` is accepted but not used
 190 |     return isReversed ? -c2 : c2; // c2, negated when isReversed is set
 191 | }
 192 | 
 193 | #else // Depth range [0; 1], origin "upper left"
 194 | #    define ML_NDC_NEAR_NO_REVERSE 0.0f
 195 | #    define ML_DEPTH_C0            (ML_DEPTH_RANGE_FAR - ML_DEPTH_RANGE_NEAR)
 196 | #    define ML_DEPTH_C1            ML_DEPTH_RANGE_NEAR
 197 | 
 198 | template <class T>
 199 | ML_INLINE T ML_ModifyProjZ(bool isReversed, T c2, T c3) { // Variant compiled when ML_OGL is zero
 200 |     return T(0.5) * ((isReversed ? -c2 : c2) + c3); // half of (c2, negated when isReversed is set, plus c3)
 201 | }
 202 | 
 203 | #endif
 204 | 
 205 | #define ML_NDC_FAR_NO_REVERSE 1.0f
 206 | 
 207 | #if ML_DEPTH_REVERSED
 208 | #    define ML_NDC_NEAR  ML_NDC_FAR_NO_REVERSE
 209 | #    define ML_NDC_FAR   ML_NDC_NEAR_NO_REVERSE
 210 | #    define ML_DEPTH_EPS -1e-7f
 211 | #else
 212 | #    define ML_NDC_NEAR  ML_NDC_NEAR_NO_REVERSE
 213 | #    define ML_NDC_FAR   ML_NDC_FAR_NO_REVERSE
 214 | #    define ML_DEPTH_EPS 1e-7f
 215 | #endif
 216 | 
 217 | // TODO
 218 | 
 219 | /*
 220 | - add some missing HLSL-compatible math functionality
 221 | - find a way to improve emulation of intrinsics currently using "for (size_t i = 0;"
 222 | - minimize "#ifndef __cplusplus" usage in "ml.hlsli"
 223 | - GCC doesn't support "members with constructors in anonymous aggregates"
 224 | - search for TODO
 225 | */
 226 | 
 227 | //======================================================================================================================
 228 | // MathLib
 229 | //======================================================================================================================
 230 | 
 231 | #ifdef ML_NAMESPACE
 232 | namespace ml {
 233 | #endif
 234 | 
 235 | //======================================================================================================================
 236 | // Forward declarations
 237 | //======================================================================================================================
 238 | 
 239 | struct bool2;
 240 | struct bool3;
 241 | struct bool4;
 242 | 
 243 | union int2;
 244 | union int3;
 245 | union int4;
 246 | 
 247 | typedef uint32_t uint;
 248 | union uint2;
 249 | union uint3;
 250 | union uint4;
 251 | 
 252 | union float2;
 253 | union float3;
 254 | union float4;
 255 | union float4x4;
 256 | 
 257 | union double2;
 258 | union double3;
 259 | union double4;
 260 | union double4x4;
 261 | 
 262 | //======================================================================================================================
 263 | // Enums
 264 | //======================================================================================================================
 265 | 
 266 | enum eStyle : uint8_t { // Selector passed as `depthStyle` to MvpToPlanes and cFrustum::Setup
 267 |     STYLE_D3D,
 268 |     STYLE_OGL, // MvpToPlanes adds mt[3] to the near plane only for this value
 269 | };
 270 | 
 271 | enum eClip : uint8_t { // Classification returned by ctRect::GetIntersectionStateWith and the cFrustum Check*_state queries
 272 |     CLIP_OUT, // 0
 273 |     CLIP_IN, // 1
 274 |     CLIP_PARTIAL, // 2
 275 | };
 276 | 
 277 | enum eCoordinate : uint32_t { // Two groups: component indices (X..W = 0..3) and dimension counts (2D..4D = 2..4)
 278 |     COORD_X = 0,
 279 |     COORD_Y,
 280 |     COORD_Z,
 281 |     COORD_W,
 282 | 
 283 |     COORD_2D = 2, // numbering restarts here, so COORD_2D shares the value 2 with COORD_Z; ctRect uses it as an array size
 284 |     COORD_3D,
 285 |     COORD_4D,
 286 | };
 287 | 
 288 | enum ePlaneType : uint32_t { // Frustum plane indices (0..5), plane counts, and one-bit-per-plane masks
 289 |     PLANE_LEFT,
 290 |     PLANE_RIGHT,
 291 |     PLANE_BOTTOM,
 292 |     PLANE_TOP,
 293 |     PLANE_NEAR,
 294 |     PLANE_FAR,
 295 | 
 296 |     PLANES_NUM, // == 6, one past PLANE_FAR
 297 |     PLANES_NO_NEAR_FAR = 4, // count that covers PLANE_LEFT..PLANE_TOP only
 298 |     PLANES_NO_FAR = 5, // count that covers PLANE_LEFT..PLANE_NEAR
 299 | 
 300 |     PLANE_MASK_L = 1 << PLANE_LEFT, // each mask is the bit whose position equals the plane index
 301 |     PLANE_MASK_R = 1 << PLANE_RIGHT,
 302 |     PLANE_MASK_B = 1 << PLANE_BOTTOM,
 303 |     PLANE_MASK_T = 1 << PLANE_TOP,
 304 |     PLANE_MASK_N = 1 << PLANE_NEAR,
 305 |     PLANE_MASK_F = 1 << PLANE_FAR,
 306 | 
 307 |     PLANE_MASK_NONE = 0,
 308 |     PLANE_MASK_LRBT = PLANE_MASK_L | PLANE_MASK_R | PLANE_MASK_B | PLANE_MASK_T, // the four side planes
 309 |     PLANE_MASK_NF = PLANE_MASK_N | PLANE_MASK_F,
 310 |     PLANE_MASK_LRBTNF = PLANE_MASK_LRBT | PLANE_MASK_NF, // all six planes
 311 | };
 312 | 
 313 | enum eProjectionData { // Sequential identifiers starting at 0; presumably indices into a projection-parameter array (user not visible here — confirm)
 314 |     PROJ_ZNEAR,
 315 |     PROJ_ZFAR,
 316 |     PROJ_ASPECT,
 317 |     PROJ_FOVX,
 318 |     PROJ_FOVY,
 319 |     PROJ_MINX,
 320 |     PROJ_MAXX,
 321 |     PROJ_MINY,
 322 |     PROJ_MAXY,
 323 |     PROJ_DIRX,
 324 |     PROJ_DIRY,
 325 |     PROJ_ANGLEMINX,
 326 |     PROJ_ANGLEMAXX,
 327 |     PROJ_ANGLEMINY,
 328 |     PROJ_ANGLEMAXY,
 329 | 
 330 |     PROJ_NUM, // number of entries listed above
 331 | };
 332 | 
 333 | enum eProjectionFlag { // Each value occupies its own bit, so values can be OR-ed together
 334 |     PROJ_ORTHO = 0x00000001,
 335 |     PROJ_REVERSED_Z = 0x00000002,
 336 |     PROJ_LEFT_HANDED = 0x00000004,
 337 | };
 338 | 
 339 | template <class T>
 340 | ML_INLINE void Swap(T& x, T& y) { // Exchanges x and y through one temporary copy
 341 |     const T previousX = x;
 342 |     x = y;
 343 |     y = previousX;
 344 | }
 345 | 
 346 | //======================================================================================================================
 347 | // Intrinsic emulation
 348 | //======================================================================================================================
 349 | 
 350 | #include "Guts/emulation.h"
 351 | 
 352 | //======================================================================================================================
 353 | // Math
 354 | //======================================================================================================================
 355 | 
 356 | #include "Guts/math.h"
 357 | 
 358 | //======================================================================================================================
 359 | // Floating point tricks
 360 | //======================================================================================================================
 361 | 
 362 | union uFloat { // Overlays a float with its 32-bit pattern so sign, exponent and mantissa can be inspected
 363 |     float f;
 364 |     uint32_t i;
 365 | 
 366 |     ML_INLINE uFloat()
 367 |         : i(0) {
 368 |     }
 369 | 
 370 |     ML_INLINE uFloat(float x)
 371 |         : f(x) {
 372 |     }
 373 | 
 374 |     ML_INLINE uFloat(uint32_t x)
 375 |         : i(x) {
 376 |     }
 377 | 
 378 |     ML_INLINE void abs() { // Clears bit 31 (the sign bit) in place
 379 |         i &= ~(1u << 31); // unsigned literal: "1 << 31" would shift into the sign bit of a signed int
 380 |     }
 381 | 
 382 |     ML_INLINE bool IsNegative() const { // True when bit 31 is set
 383 |         return (i >> 31) != 0;
 384 |     }
 385 | 
 386 |     ML_INLINE uint32_t Mantissa() const { // Low 23 bits
 387 |         return i & ((1u << 23) - 1);
 388 |     }
 389 | 
 390 |     ML_INLINE uint32_t Exponent() const { // Bits 23..30, returned as stored (0..255)
 391 |         return (i >> 23) & 255;
 392 |     }
 393 | 
 394 |     ML_INLINE bool IsInf() const { // Exponent field all ones and mantissa zero
 395 |         return Exponent() == 255 && Mantissa() == 0;
 396 |     }
 397 | 
 398 |     ML_INLINE bool IsNan() const { // Exponent field all ones and mantissa non-zero
 399 |         return Exponent() == 255 && Mantissa() != 0;
 400 |     }
 401 | 
 402 |     static ML_INLINE float PrecisionGreater(float x) { // Difference between x and the float whose bit pattern is x's plus one
 403 |         uFloat y(x);
 404 |         y.i++;
 405 | 
 406 |         return y.f - x;
 407 |     }
 408 | 
 409 |     static ML_INLINE float PrecisionLess(float x) { // Difference between x and the float whose bit pattern is x's minus one
 410 |         uFloat y(x);
 411 |         y.i--;
 412 | 
 413 |         return y.f - x;
 414 |     }
 415 | };
 416 | 
 417 | union uDouble { // Overlays a double with its 64-bit pattern so sign, exponent and mantissa can be inspected
 418 |     double f;
 419 |     uint64_t i;
 420 | 
 421 |     ML_INLINE uDouble()
 422 |         : i(0) {
 423 |     }
 424 | 
 425 |     ML_INLINE uDouble(double x)
 426 |         : f(x) {
 427 |     }
 428 | 
 429 |     ML_INLINE uDouble(uint64_t x)
 430 |         : i(x) {
 431 |     }
 432 | 
 433 |     ML_INLINE bool IsNegative() const { // True when bit 63 is set
 434 |         return (i >> 63) != 0;
 435 |     }
 436 | 
 437 |     ML_INLINE void abs() { // Clears bit 63 (the sign bit) in place
 438 |         i &= ~(1ULL << 63);
 439 |     }
 440 | 
 441 |     ML_INLINE uint64_t Mantissa() const { // Low 52 bits
 442 |         return i & ((1ULL << 52) - 1);
 443 |     }
 444 | 
 445 |     ML_INLINE uint64_t Exponent() const { // Bits 52..62, returned as stored (0..2047)
 446 |         return (i >> 52) & 2047;
 447 |     }
 448 | 
 449 |     ML_INLINE bool IsInf() const { // Exponent field all ones and mantissa zero
 450 |         return Exponent() == 2047 && Mantissa() == 0;
 451 |     }
 452 | 
 453 |     ML_INLINE bool IsNan() const { // Exponent field all ones and mantissa non-zero
 454 |         return Exponent() == 2047 && Mantissa() != 0;
 455 |     }
 456 | 
 457 |     static ML_INLINE double PrecisionGreater(double x) { // Difference between x and the double whose bit pattern is x's plus one
 458 |         uDouble y(x);
 459 |         y.i++;
 460 | 
 461 |         return y.f - x;
 462 |     }
 463 | 
 464 |     static ML_INLINE double PrecisionLess(double x) { // Difference between x and the double whose bit pattern is x's minus one
 465 |         uDouble y(x);
 466 |         y.i--;
 467 | 
 468 |         return y.f - x;
 469 |     }
 470 | };
 471 | 
 472 | //======================================================================================================================
 473 | // Data types
 474 | //======================================================================================================================
 475 | 
 476 | #define ML_COMPARE_UNOPT(B, C, op) /* Scalar compare of members x and y: bit 0 = x result, bit 1 = y result, passed to B's constructor */ \
 477 |     ML_INLINE B operator op(const C& v) const { \
 478 |         int32_t mask = x op v.x ? 0x1 : 0; \
 479 |         mask |= y op v.y ? 0x2 : 0; \
 480 |         return B(mask); \
 481 |     }
 482 | 
 483 | #define ML_COMPARE(B, C, op, f, movemask, reg) /* Register compare: B is constructed from movemask(f(reg, v.reg)) */ \
 484 |     ML_INLINE B operator op(const C& v) const { \
 485 |         return B(movemask(f(reg, v.reg))); \
 486 |     }
 487 | 
 488 | #define ML_OP_UNOPT(C, T, op, opeq) /* Scalar x/y operators: vector-vector, scalar-vector, vector-scalar, and both compound assignments */ \
 489 |     ML_INLINE C operator op(const C& v) const { \
 490 |         return C(x op v.x, y op v.y); \
 491 |     } \
 492 |     ML_INLINE friend C operator op(T c, const C& v) { \
 493 |         return C(c op v.x, c op v.y); \
 494 |     } \
 495 |     ML_INLINE friend C operator op(const C& v, T c) { \
 496 |         return C(v.x op c, v.y op c); \
 497 |     } \
 498 |     ML_INLINE void operator opeq(const C& v) { \
 499 |         x opeq v.x; \
 500 |         y opeq v.y; \
 501 |     } \
 502 |     ML_INLINE void operator opeq(T c) { \
 503 |         x opeq c; \
 504 |         y opeq c; \
 505 |     }
 506 | 
 507 | #define ML_OP(C, T, op, opeq, f, broadcast, reg) /* Register operators: every overload calls f on member reg; scalar operands go through broadcast() first */ \
 508 |     ML_INLINE C operator op(const C& v) const { \
 509 |         return f(reg, v.reg); \
 510 |     } \
 511 |     ML_INLINE friend C operator op(T c, const C& v) { \
 512 |         return f(broadcast(c), v.reg); \
 513 |     } \
 514 |     ML_INLINE friend C operator op(const C& v, T c) { \
 515 |         return f(v.reg, broadcast(c)); \
 516 |     } \
 517 |     ML_INLINE void operator opeq(const C& v) { \
 518 |         reg = f(reg, v.reg); \
 519 |     } \
 520 |     ML_INLINE void operator opeq(T c) { \
 521 |         reg = f(reg, broadcast(c)); \
 522 |     }
 523 | 
 524 | // Vector swizzling
 525 | #include "Guts/swizzle.h"
 526 | 
 527 | // Boolean (1 bit emulation)
 528 | #include "Guts/bool1.h"
 529 | 
 530 | // Integer
 531 | #include "Guts/i32.h"
 532 | #include "Guts/u32.h"
 533 | 
 534 | // Float
 535 | #include "Guts/f16.h"
 536 | #include "Guts/f32.h"
 537 | #include "Guts/f64.h"
 538 | 
 539 | // Conversion
 540 | #include "Guts/conversion.h"
 541 | 
 542 | #undef ML_COMPARE_UNOPT
 543 | #undef ML_COMPARE
 544 | #undef ML_OP_UNOPT
 545 | #undef ML_OP
 546 | 
 547 | #undef ML_SWIZZLE_2
 548 | #undef ML_SWIZZLE_3
 549 | #undef ML_SWIZZLE_4
 550 | 
 551 | #undef ML_X
 552 | #undef ML_Y
 553 | #undef ML_Z
 554 | #undef ML_W
 555 | 
 556 | //======================================================================================================================
 557 | // Misc
 558 | //======================================================================================================================
 559 | 
 560 | template <class T>
 561 | ML_INLINE T CurveSmooth(const T& x) { // Evaluates the cubic 3x^2 - 2x^3
 562 |     return x * x * (3.0 - 2.0 * x); // literals are double, so T must support mixed arithmetic with double
 563 | }
 564 | 
 565 | template <class T>
 566 | ML_INLINE T CurveSin(const T& x) { // Evaluates the cubic x - x^3 / 3
 567 |     return x * (1.0 - x * x / 3.0); // literals are double, so T must support mixed arithmetic with double
 568 | }
 569 | 
 570 | template <class T>
 571 | ML_INLINE T WaveTriangle(const T& x) { // Triangle wave built from frac(); assuming frac() yields the fractional part in [0, 1), the result is 0 at integers and 1 at half-integers
 572 |     return abs(frac(x + T(0.5)) * T(2.0) - T(1.0));
 573 | }
 574 | 
 575 | template <class T>
 576 | ML_INLINE T WaveTriangleSmooth(const T& x) { // WaveTriangle(x) passed through CurveSmooth
 577 |     return CurveSmooth(WaveTriangle(x));
 578 | }
 579 | 
 580 | ML_INLINE float DoubleToGequal(double dValue) { // Converts to float, then nudges the result upward until "dValue - result" is no longer positive
 581 |     float fValue = (float)dValue;
 582 |     float fError = (float)(dValue - fValue);
 583 | 
 584 |     int32_t exponent = 0;
 585 |     frexp(fValue, &exponent); // only the binary exponent is needed; the returned fraction is discarded
 586 |     exponent = max(exponent, 0);
 587 |     exponent = (int32_t)(float(exponent) * 0.30103f); // floor(log10(2^exponent)); avoids "1 << exponent", which is undefined for exponent >= 31
 588 | 
 589 |     float fStep = 1.0f / pow(10.0f, float(7 - exponent)); // initial increment: 10^(exponent - 7)
 590 | 
 591 |     while (fError > 0.0f) {
 592 |         fValue += fStep;
 593 | 
 594 |         float fCurrError = float(dValue - fValue);
 595 | 
 596 |         if (fCurrError == fError)
 597 |             fStep += fStep; // the addition had no effect: double the increment and retry
 598 |         else
 599 |             fError = fCurrError;
 600 |     }
 601 | 
 602 |     return fValue;
 603 | }
 604 | 
 605 | ML_INLINE float DoubleToLequal(double dValue) { // Converts to float, then nudges the result downward until "dValue - result" is no longer negative
 606 |     float fValue = (float)dValue;
 607 |     float fError = (float)(dValue - fValue);
 608 | 
 609 |     int32_t exponent = 0;
 610 |     frexp(fValue, &exponent); // only the binary exponent is needed; the returned fraction is discarded
 611 |     exponent = max(exponent, 0);
 612 |     exponent = (int32_t)(float(exponent) * 0.30103f); // floor(log10(2^exponent)); avoids "1 << exponent", which is undefined for exponent >= 31
 613 | 
 614 |     float fStep = 1.0f / pow(10.0f, float(7 - exponent)); // initial decrement: 10^(exponent - 7)
 615 | 
 616 |     while (fError < 0.0f) {
 617 |         fValue -= fStep;
 618 | 
 619 |         float fCurrError = float(dValue - fValue);
 620 | 
 621 |         if (fCurrError == fError)
 622 |             fStep += fStep; // the subtraction had no effect: double the decrement and retry
 623 |         else
 624 |             fError = fCurrError;
 625 |     }
 626 | 
 627 |     return fValue;
 628 | }
 629 | 
 630 | //======================================================================================================================
 631 | // Rect
 632 | //======================================================================================================================
 633 | 
 634 | template <class T>
 635 | class ctRect { // Axis-aligned 2D rectangle stored as a min corner and a max corner of type T
 636 | public:
 637 |     union { // min corner: array view (vMin) and named view (minx, miny) of the same storage
 638 |         struct {
 639 |             T vMin[COORD_2D];
 640 |         };
 641 | 
 642 |         struct {
 643 |             T minx;
 644 |             T miny;
 645 |         };
 646 |     };
 647 | 
 648 |     union { // max corner: array view (vMax) and named view (maxx, maxy) of the same storage
 649 |         struct {
 650 |             T vMax[COORD_2D];
 651 |         };
 652 | 
 653 |         struct {
 654 |             T maxx;
 655 |             T maxy;
 656 |         };
 657 |     };
 658 | 
 659 | public:
 660 |     ML_INLINE ctRect() { // Starts in the cleared state
 661 |         Clear();
 662 |     }
 663 | 
 664 |     ML_INLINE void Clear() { // Sets min to 2^30 and max to -2^30, so IsValid() is false until points are added
 665 |         minx = miny = T(1 << 30);
 666 |         maxx = maxy = T(-(1 << 30));
 667 |     }
 668 | 
 669 |     ML_INLINE bool IsValid() const { // Strict comparison: a rectangle with zero width or height is not valid
 670 |         return maxx > minx && maxy > miny;
 671 |     }
 672 | 
 673 |     ML_INLINE void Add(T px, T py) { // Grows the rectangle so that it contains the point (px, py)
 674 |         minx = min(minx, px);
 675 |         maxx = max(maxx, px);
 676 |         miny = min(miny, py);
 677 |         maxy = max(maxy, py);
 678 |     }
 679 | 
 680 |     ML_INLINE void Add(const T* pPoint2) { // Same as Add(px, py); reads pPoint2[0] and pPoint2[1]
 681 |         Add(pPoint2[0], pPoint2[1]);
 682 |     }
 683 | 
 684 |     ML_INLINE bool IsIntersectWith(const T* pMin, const T* pMax) const { // False only when the two rectangles are separated along x or y
 685 |         ML_Assert(IsValid());
 686 | 
 687 |         if (maxx < pMin[0] || maxy < pMin[1] || minx > pMax[0] || miny > pMax[1])
 688 |             return false;
 689 | 
 690 |         return true; // touching edges count as intersecting (comparisons above are strict)
 691 |     }
 692 | 
 693 |     ML_INLINE bool IsIntersectWith(const ctRect<T>& rRect) const { // Overload taking another ctRect
 694 |         return IsIntersectWith(rRect.vMin, rRect.vMax);
 695 |     }
 696 | 
 697 |     ML_INLINE eClip GetIntersectionStateWith(const T* pMin, const T* pMax) const { // CLIP_OUT: no overlap; CLIP_IN: this rectangle strictly encloses [pMin, pMax]; otherwise CLIP_PARTIAL
 698 |         ML_Assert(IsValid());
 699 | 
 700 |         if (!IsIntersectWith(pMin, pMax))
 701 |             return CLIP_OUT;
 702 | 
 703 |         if (minx < pMin[0] && maxx > pMax[0] && miny < pMin[1] && maxy > pMax[1])
 704 |             return CLIP_IN;
 705 | 
 706 |         return CLIP_PARTIAL;
 707 |     }
 708 | 
 709 |     ML_INLINE eClip GetIntersectionStateWith(const ctRect<T>& rRect) const { // Overload taking another ctRect
 710 |         return GetIntersectionStateWith(rRect.vMin, rRect.vMax);
 711 |     }
 712 | };
 713 | 
 714 | //======================================================================================================================
 715 | // Frustum
 716 | //======================================================================================================================
 717 | 
 718 | ML_INLINE bool MvpToPlanes(eStyle depthStyle, const float4x4& m, float4* pvPlane6) { // Derives six planes from m and writes them to pvPlane6[PLANE_*]; returns true when near and far were swapped
 719 |     const float eps = 1e-7f;
 720 | 
 721 |     float4x4 mt;
 722 |     m.TransposeTo(mt);
 723 | 
 724 |     float4 l = mt[3] + mt[0]; // every plane is a sum or difference of entries of the transposed matrix
 725 |     float4 r = mt[3] - mt[0];
 726 |     float4 b = mt[3] + mt[1];
 727 |     float4 t = mt[3] - mt[1];
 728 |     float4 f = mt[3] - mt[2];
 729 |     float4 n = mt[2];
 730 | 
 731 |     if (depthStyle == STYLE_OGL)
 732 |         n += mt[3]; // STYLE_OGL only: the near plane becomes mt[2] + mt[3]
 733 | 
 734 |     // Side planes
 735 |     l *= rsqrt(dot(l.xyz, l.xyz)); // scaled by rsqrt of the squared xyz length (no epsilon guard here, unlike near/far below)
 736 |     r *= rsqrt(dot(r.xyz, r.xyz));
 737 |     b *= rsqrt(dot(b.xyz, b.xyz));
 738 |     t *= rsqrt(dot(t.xyz, t.xyz));
 739 | 
 740 |     // Near & far planes
 741 |     n /= max(length(n.xyz), eps); // eps keeps the divisor away from zero
 742 |     f /= max(length(f.xyz), eps);
 743 | 
 744 |     // Handle reversed projection
 745 |     bool bReversed = abs(n.w) > abs(f.w); // the candidate near plane has the larger |w|: treat the projection as reversed
 746 | 
 747 |     if (bReversed)
 748 |         Swap(n, f);
 749 | 
 750 |     // Handle infinite projection
 751 |     if (length(f.xyz) < eps)
 752 |         f = float4(-n.x, -n.y, -n.z, f.w); // degenerate far xyz: reuse the negated near xyz and keep f.w
 753 | 
 754 |     pvPlane6[PLANE_LEFT] = l;
 755 |     pvPlane6[PLANE_RIGHT] = r;
 756 |     pvPlane6[PLANE_BOTTOM] = b;
 757 |     pvPlane6[PLANE_TOP] = t;
 758 |     pvPlane6[PLANE_NEAR] = n;
 759 |     pvPlane6[PLANE_FAR] = f;
 760 | 
 761 |     return bReversed;
 762 | }
 763 | 
 764 | class cFrustum {
 765 | private:
 766 |     float4 m_vPlane[PLANES_NUM] = {};
 767 |     float4x4 m_mPlanesT = {};
 768 |     v4f m_vMask[PLANES_NUM] = {};
 769 | 
 770 | public:
 771 |     ML_INLINE void Setup(eStyle depthStyle, const float4x4& mMvp) { // Rebuilds m_vPlane, m_mPlanesT and m_vMask from mMvp
 772 |         MvpToPlanes(depthStyle, mMvp, m_vPlane); // the returned reversed-projection flag is not used here
 773 | 
 774 |         m_mPlanesT[0] = m_vPlane[PLANE_LEFT]; // the four side planes are copied, then transposed below
 775 |         m_mPlanesT[1] = m_vPlane[PLANE_RIGHT];
 776 |         m_mPlanesT[2] = m_vPlane[PLANE_BOTTOM];
 777 |         m_mPlanesT[3] = m_vPlane[PLANE_TOP];
 778 |         m_mPlanesT.Transpose();
 779 | 
 780 |         for (uint32_t i = 0; i < PLANES_NUM; i++)
 781 |             m_vMask[i] = _mm_cmpgt_ps(m_vPlane[i].xmm, _mm_setzero_ps()); // per-lane mask, set where the plane component is > 0
 782 |     }
 783 | 
 784 |     ML_INLINE void Translate(const float3& vPos) { // Overwrites each plane's w with Dot43(plane, vPos); x, y and z are left untouched
 785 |         // Update of m_vMask is not required, because only m_vMask.w can be changed, but this component doesn't affect results
 786 |         for (uint32_t i = 0; i < PLANES_NUM; i++)
 787 |             m_vPlane[i].w = Dot43(m_vPlane[i], vPos);
 788 |     }
 789 | 
 790 |     ML_INLINE bool CheckSphere(const float3& center, float fRadius, uint32_t planes = PLANES_NUM) const {
 791 |         // Returns false as soon as one of the first `planes` planes evaluates below -fRadius at the center
 792 |         v4f centerW1 = v4f_setw1(center.xmm);
 793 |         for (uint32_t planeIndex = 0; planeIndex < planes; ++planeIndex) {
 794 |             float signedDistance = dot(m_vPlane[planeIndex], centerW1);
 795 | 
 796 |             if (signedDistance < -fRadius)
 797 |                 return false;
 798 |         }
 799 | 
 800 |         return true;
 801 |     }
 802 | 
 803 |     ML_INLINE bool CheckAabb(const float3& minv, const float3& maxv, uint32_t planes) const { // True unless one of the first `planes` planes rejects the box [minv, maxv]
 804 |         v4f min1 = v4f_setw1(minv.xmm);
 805 |         v4f max1 = v4f_setw1(maxv.xmm);
 806 | 
 807 |         for (uint32_t i = 0; i < planes; i++) {
 808 |             v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]); // per lane: max where the plane component is positive, otherwise min
 809 |             v = v4f_dot44(m_vPlane[i].xmm, v);
 810 | 
 811 |             if (v4f_isnegative1_all(v)) // presumably a sign test on the dot product; helper is defined elsewhere
 812 |                 return false;
 813 |         }
 814 | 
 815 |         return true;
 816 |     }
 817 | 
 818 |     ML_INLINE bool CheckCapsule(const float3& capsule_start, const float3& capsule_axis, float capsule_radius, uint32_t planes) const {
 819 |         // The capsule is the segment [capsule_start, capsule_start + capsule_axis] with radius capsule_radius. Reference:
 820 |         // https://github.com/toxygen/STA/blob/master/celestia-src/celmath/frustum.cpp
 821 |         float radiusSq = capsule_radius * capsule_radius;
 822 |         float3 segmentEnd = capsule_start + capsule_axis;
 823 | 
 824 |         for (uint32_t planeIndex = 0; planeIndex < planes; ++planeIndex) {
 825 |             float distStart = Dot43(m_vPlane[planeIndex], capsule_start);
 826 |             float distEnd = Dot43(m_vPlane[planeIndex], segmentEnd);
 827 | 
 828 |             // A plane can reject only when the product of both endpoint distances exceeds radius^2
 829 |             if (!(distStart * distEnd > radiusSq))
 830 |                 continue;
 831 | 
 832 |             // The endpoint with the smaller absolute distance decides
 833 |             float nearest = abs(distStart) <= abs(distEnd) ? distStart : distEnd;
 834 | 
 835 |             if (nearest < -capsule_radius)
 836 |                 return false;
 837 |         }
 838 | 
 839 |         return true;
 840 |     }
 841 | 
 842 |     ML_INLINE bool CheckSphere_mask(const float3& center, float fRadius, uint32_t mask, uint32_t planes) const {
 843 |         v4f centerW1 = v4f_setw1(center.xmm);
 844 | 
 845 |         for (uint32_t planeIndex = 0; planeIndex < planes; ++planeIndex) {
 846 |             if (mask & (1 << planeIndex))
 847 |                 continue; // a set bit in `mask` excludes that plane from the test
 848 | 
 849 |             float signedDistance = dot(m_vPlane[planeIndex], centerW1);
 850 |             if (signedDistance < -fRadius)
 851 |                 return false;
 852 |         }
 853 | 
 854 |         return true;
 855 |     }
 856 | 
 857 |     ML_INLINE bool CheckAabb_mask(const float3& minv, const float3& maxv, uint32_t mask, uint32_t planes) const { // Same test as CheckAabb, but planes whose bit is set in `mask` are skipped
 858 |         v4f min1 = v4f_setw1(minv.xmm);
 859 |         v4f max1 = v4f_setw1(maxv.xmm);
 860 | 
 861 |         for (uint32_t i = 0; i < planes; i++) {
 862 |             if (!(mask & (1 << i))) { // only planes whose mask bit is clear are tested
 863 |                 v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]); // per lane: max where the plane component is positive, otherwise min
 864 |                 v = v4f_dot44(m_vPlane[i].xmm, v);
 865 | 
 866 |                 if (v4f_isnegative1_all(v))
 867 |                     return false;
 868 |             }
 869 |         }
 870 | 
 871 |         return true;
 872 |     }
 873 | 
 874 |     ML_INLINE eClip CheckSphere_state(const float3& center, float fRadius, uint32_t planes) const {
 875 |         v4f centerW1 = v4f_setw1(center.xmm);
 876 | 
 877 |         bool crossesPlane = false; // becomes true once any tested plane lies within fRadius of the center
 878 | 
 879 |         for (uint32_t planeIndex = 0; planeIndex < planes; ++planeIndex) {
 880 |             float signedDistance = dot(m_vPlane[planeIndex], centerW1);
 881 | 
 882 |             if (signedDistance < -fRadius)
 883 |                 return CLIP_OUT;
 884 | 
 885 |             if (signedDistance < fRadius)
 886 |                 crossesPlane = true;
 887 |         }
 888 | 
 889 |         return crossesPlane ? CLIP_PARTIAL : CLIP_IN;
 890 |     }
 891 | 
 892 |     ML_INLINE eClip CheckAabb_state(const float3& minv, const float3& maxv, uint32_t planes) const {
 893 |         v4f min1 = v4f_setw1(minv.xmm);
 894 |         v4f max1 = v4f_setw1(maxv.xmm);
 895 | 
 896 |         eClip clip = CLIP_IN;
 897 | 
 898 |         for (uint32_t i = 0; i < planes; i++) {
 899 |             v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
 900 |             v = v4f_dot44(m_vPlane[i].xmm, v);
 901 | 
 902 |             if (v4f_isnegative1_all(v))
 903 |                 return CLIP_OUT;
 904 | 
 905 |             v = _mm_blendv_ps(max1, min1, m_vMask[i]);
 906 |             v = v4f_dot44(m_vPlane[i].xmm, v);
 907 | 
 908 |             if (v4f_isnegative1_all(v))
 909 |                 clip = CLIP_PARTIAL;
 910 |         }
 911 | 
 912 |         return clip;
 913 |     }
 914 | 
 915 |     ML_INLINE eClip CheckCapsule_state(const float3& capsule_start, const float3& capsule_axis, float capsule_radius, uint32_t planes) const {
 916 |         float r2 = capsule_radius * capsule_radius;
 917 |         float3 capsule_end = capsule_start + capsule_axis;
 918 | 
 919 |         uint32_t intersections = 0;
 920 | 
 921 |         for (uint32_t i = 0; i < planes; i++) {
 922 |             float signedDist0 = Dot43(m_vPlane[i], capsule_start);
 923 |             float signedDist1 = Dot43(m_vPlane[i], capsule_end);
 924 | 
 925 |             if (signedDist0 * signedDist1 > r2) {
 926 |                 // Endpoints of capsule are on same side of plane. Test closest endpoint to see if it lies closer to the plane than radius
 927 |                 if (abs(signedDist0) <= abs(signedDist1)) {
 928 |                     if (signedDist0 < -capsule_radius)
 929 |                         return CLIP_OUT;
 930 |                     else if (signedDist0 < capsule_radius)
 931 |                         intersections |= (1 << i);
 932 |                 } else {
 933 |                     if (signedDist1 < -capsule_radius)
 934 |                         return CLIP_OUT;
 935 |                     else if (signedDist1 < capsule_radius)
 936 |                         intersections |= (1 << i);
 937 |                 }
 938 |             } else {
 939 |                 // Capsule endpoints are on different sides of the plane, so we have an intersection
 940 |                 intersections |= (1 << i);
 941 |             }
 942 |         }
 943 | 
 944 |         return !intersections ? CLIP_IN : CLIP_PARTIAL;
 945 |     }
 946 | 
 947 |     ML_INLINE eClip CheckSphere_mask_state(const float3& center, float fRadius, uint32_t& mask, uint32_t planes) const {
 948 |         v4f p1 = v4f_setw1(center.xmm);
 949 | 
 950 |         eClip clip = CLIP_IN;
 951 | 
 952 |         for (uint32_t i = 0; i < planes; i++) {
 953 |             if (!(mask & (1 << i))) {
 954 |                 float d = dot(m_vPlane[i], p1);
 955 | 
 956 |                 if (d < -fRadius)
 957 |                     return CLIP_OUT;
 958 | 
 959 |                 if (d < fRadius)
 960 |                     clip = CLIP_PARTIAL;
 961 |                 else
 962 |                     mask |= 1 << i;
 963 |             }
 964 |         }
 965 | 
 966 |         return clip;
 967 |     }
 968 | 
 969 |     ML_INLINE eClip CheckAabb_mask_state(const float3& minv, const float3& maxv, uint32_t& mask, uint32_t planes) const {
 970 |         v4f min1 = v4f_setw1(minv.xmm);
 971 |         v4f max1 = v4f_setw1(maxv.xmm);
 972 | 
 973 |         eClip result = CLIP_IN;
 974 | 
 975 |         for (uint32_t i = 0; i < planes; i++) {
 976 |             if (!(mask & (1 << i))) {
 977 |                 v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
 978 |                 v = v4f_dot44(m_vPlane[i].xmm, v);
 979 | 
 980 |                 if (v4f_isnegative1_all(v))
 981 |                     return CLIP_OUT;
 982 | 
 983 |                 v = _mm_blendv_ps(max1, min1, m_vMask[i]);
 984 |                 v = v4f_dot44(m_vPlane[i].xmm, v);
 985 | 
 986 |                 if (v4f_isnegative1_all(v))
 987 |                     result = CLIP_PARTIAL;
 988 |                 else
 989 |                     mask |= 1 << i;
 990 |             }
 991 |         }
 992 | 
 993 |         return result;
 994 |     }
 995 | 
 996 |     ML_INLINE void SetNearFar(float zNearNeg, float zFarNeg) {
 997 |         m_vPlane[PLANE_NEAR].w = zNearNeg;
 998 |         m_vPlane[PLANE_FAR].w = -zFarNeg;
 999 |     }
1000 | 
1001 |     ML_INLINE void SetFar(float zFarNeg) {
1002 |         m_vPlane[PLANE_FAR].w = -zFarNeg;
1003 |     }
1004 | 
1005 |     ML_INLINE const float4& GetPlane(uint32_t plane) {
1006 |         ML_Assert(plane < PLANES_NUM);
1007 | 
1008 |         return m_vPlane[plane];
1009 |     }
1010 | };
1011 | 
1012 | ML_INLINE void DecomposeProjection(eStyle originStyle, eStyle depthStyle, const float4x4& proj, uint32_t* puiFlags, float* pfSettings15, float* pfUnproject2, float* pfFrustum4,
1013 |     float* pfProject3, float* pfSafeNearZ) {
1014 |     float4 vPlane[PLANES_NUM];
1015 |     bool bReversedZ = MvpToPlanes(depthStyle, proj, vPlane);
1016 | 
1017 |     bool bIsOrtho = proj.a33 == 1.0f ? true : false;
1018 | 
1019 |     float fNearZ = -vPlane[PLANE_NEAR].w;
1020 |     float fFarZ = vPlane[PLANE_FAR].w;
1021 | 
1022 |     float x0, x1, y0, y1;
1023 |     if (bIsOrtho) {
1024 |         x0 = -vPlane[PLANE_LEFT].w;
1025 |         x1 = vPlane[PLANE_RIGHT].w;
1026 |         y0 = -vPlane[PLANE_BOTTOM].w;
1027 |         y1 = vPlane[PLANE_TOP].w;
1028 | 
1029 |         if (proj.a11 < 0.0f)
1030 |             Swap(y0, y1);
1031 |     } else {
1032 |         x0 = vPlane[PLANE_LEFT].z / vPlane[PLANE_LEFT].x;
1033 |         x1 = vPlane[PLANE_RIGHT].z / vPlane[PLANE_RIGHT].x;
1034 |         y0 = vPlane[PLANE_BOTTOM].z / vPlane[PLANE_BOTTOM].y;
1035 |         y1 = vPlane[PLANE_TOP].z / vPlane[PLANE_TOP].y;
1036 |     }
1037 | 
1038 |     // const float3& col2 = bReversedZ ? proj.col3 : proj.col2;
1039 |     float4 clip = proj * float4(0.0f, 0.0f, fNearZ, 1.0f);
1040 |     float3 col2 = bIsOrtho ? float3(proj.Col(2)) * (bReversedZ ? -1.0f : 1.0f) : float3(0.0f, 0.0f, clip.w > 0.0f ? 1.0f : -1.0f);
1041 |     bool cmp = dot(cross(float3(proj.Col(0)), float3(proj.Col(1))), col2.xyz) > 0.0f;
1042 |     bool bLeftHanded = proj.a11 > 0.0f ? cmp : !cmp;
1043 | 
1044 |     if (puiFlags) {
1045 |         *puiFlags = bIsOrtho ? PROJ_ORTHO : 0;
1046 |         *puiFlags |= bReversedZ ? PROJ_REVERSED_Z : 0;
1047 |         *puiFlags |= bLeftHanded ? PROJ_LEFT_HANDED : 0;
1048 |     }
1049 | 
1050 |     if (pfUnproject2) {
1051 |         // z = u0 / (depth + u1)
1052 | 
1053 |         pfUnproject2[0] = ML_DEPTH_C0 * proj.a23 / proj.a32;
1054 |         pfUnproject2[1] = -(ML_DEPTH_C0 * proj.a22 / proj.a32 + ML_DEPTH_C1);
1055 | 
1056 |         // z = 1 / (depth * u0 + u1);
1057 | 
1058 |         // pfUnproject2[0] = proj.a32 / (ML_DEPTH_C0 * proj.a23);
1059 |         // pfUnproject2[1] = -(proj.a22 / proj.a23 + ML_DEPTH_C1 / pfUnproject2[0]);
1060 |     }
1061 | 
1062 |     if (pfSafeNearZ) {
1063 |         *pfSafeNearZ = fNearZ - ML_DEPTH_EPS;
1064 | 
1065 |         if (!bIsOrtho) {
1066 |             float maxx = max(abs(x0), abs(x1));
1067 |             float maxy = max(abs(y0), abs(y1));
1068 | 
1069 |             *pfSafeNearZ *= sqrt(maxx * maxx + maxy * maxy + 1.0f);
1070 |         }
1071 |     }
1072 | 
1073 |     if (pfProject3) {
1074 |         // IMPORTANT: Rg - geometry radius, Rp - projected radius, Rn - projected normalized radius
1075 |         //      keep in mind:
1076 |         //          zp = -(mView * p).z
1077 |         //          zp_fix = mix(zp, 1.0, bIsOrtho), or
1078 |         //          zp_fix = (mViewProj * p).w
1079 |         //      project:
1080 |         //          Rn.x = Rg * pfProject3[0] / zp_fix
1081 |         //          Rn.y = Rg * pfProject3[1] / zp_fix
1082 |         //          Rp = 0.5 * viewport.w * Rn.x, or
1083 |         //          Rp = 0.5 * viewport.h * Rn.y, or
1084 |         //          Rp = Rg * K / zp_fix
1085 |         //      unproject:
1086 |         //          Rn.x = 2.0 * Rp / viewport.w
1087 |         //          Rn.y = 2.0 * Rp / viewport.h
1088 |         //          Rg = Rn.x * zp_fix / pfProject3[0], or
1089 |         //          Rg = Rn.y * zp_fix / pfProject3[1], or
1090 |         //          Rg = Rp * zp_fix / K
1091 |         //      K = 0.5 * viewport.w * pfProject3[0] = 0.5 * viewport.h * pfProject3[1]
1092 | 
1093 |         float fProjectx = 2.0f / (x1 - x0);
1094 |         float fProjecty = 2.0f / (y1 - y0);
1095 | 
1096 |         pfProject3[0] = abs(fProjectx);
1097 |         pfProject3[1] = abs(fProjecty);
1098 |         pfProject3[2] = bIsOrtho ? 1.0f : 0.0f;
1099 |     }
1100 | 
1101 |     if (pfFrustum4) {
1102 |         // IMPORTANT: view space position from screen space uv [0, 1]
1103 |         //          ray.xy = (pfFrustum4.zw * uv + pfFrustum4.xy) * mix(zDistanceNeg, -1.0, bIsOrtho)
1104 |         //          ray.z = 1.0 * zDistanceNeg
1105 | 
1106 |         pfFrustum4[0] = -x0;
1107 |         pfFrustum4[2] = x0 - x1;
1108 | 
1109 |         if (originStyle == STYLE_D3D) {
1110 |             pfFrustum4[1] = -y1;
1111 |             pfFrustum4[3] = y1 - y0;
1112 |         } else {
1113 |             pfFrustum4[1] = -y0;
1114 |             pfFrustum4[3] = y0 - y1;
1115 |         }
1116 |     }
1117 | 
1118 |     if (pfSettings15) {
1119 |         // Swap is possible, because it is the last pass...
1120 |         if (x0 > x1)
1121 |             Swap(x0, x1);
1122 | 
1123 |         if (y0 > y1)
1124 |             Swap(y0, y1);
1125 | 
1126 |         float fAngleY0 = atan(bIsOrtho ? 0.0f : y0);
1127 |         float fAngleY1 = atan(bIsOrtho ? 0.0f : y1);
1128 |         float fAngleX0 = atan(bIsOrtho ? 0.0f : x0);
1129 |         float fAngleX1 = atan(bIsOrtho ? 0.0f : x1);
1130 | 
1131 |         float fAspect = (x1 - x0) / (y1 - y0);
1132 | 
1133 |         pfSettings15[PROJ_ZNEAR] = fNearZ;
1134 |         pfSettings15[PROJ_ZFAR] = fFarZ;
1135 |         pfSettings15[PROJ_ASPECT] = fAspect;
1136 |         pfSettings15[PROJ_FOVX] = fAngleX1 - fAngleX0;
1137 |         pfSettings15[PROJ_FOVY] = fAngleY1 - fAngleY0;
1138 |         pfSettings15[PROJ_MINX] = x0 * fNearZ;
1139 |         pfSettings15[PROJ_MAXX] = x1 * fNearZ;
1140 |         pfSettings15[PROJ_MINY] = y0 * fNearZ;
1141 |         pfSettings15[PROJ_MAXY] = y1 * fNearZ;
1142 |         pfSettings15[PROJ_ANGLEMINX] = fAngleX0;
1143 |         pfSettings15[PROJ_ANGLEMAXX] = fAngleX1;
1144 |         pfSettings15[PROJ_ANGLEMINY] = fAngleY0;
1145 |         pfSettings15[PROJ_ANGLEMAXY] = fAngleY1;
1146 |         pfSettings15[PROJ_DIRX] = (fAngleX0 + fAngleX1) * 0.5f;
1147 |         pfSettings15[PROJ_DIRY] = (fAngleY0 + fAngleY1) * 0.5f;
1148 |     }
1149 | }
1150 | 
1151 | #include "Guts/other.h"
1152 | #include "Guts/packing.h"
1153 | #include "Guts/sorting.h"
1154 | 
1155 | #ifdef ML_NAMESPACE
1156 | }
1157 | #endif
1158 | 
1159 | //======================================================================================================================
1160 | // End
1161 | //======================================================================================================================
1162 | 
1163 | #if defined(__GNUC__)
1164 | #    pragma GCC diagnostic pop
1165 | #elif defined(__clang__)
1166 | #    pragma clang diagnostic pop
1167 | #else
1168 | #    pragma warning(pop)
1169 | #endif
1170 | 


--------------------------------------------------------------------------------