├── .clang-format ├── CMakeLists.txt ├── Guts ├── bool1.h ├── conversion.h ├── emulation.h ├── f16.h ├── f32.h ├── f64.h ├── i32.h ├── math.h ├── other.h ├── packing.h ├── sorting.h ├── swizzle.h ├── tests.h └── u32.h ├── LICENSE.txt ├── README.md ├── ml.h └── ml.hlsli /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: DontAlign 5 | AlignArrayOfStructures: None 6 | AlignConsecutiveAssignments: 7 | Enabled: false 8 | AcrossEmptyLines: false 9 | AcrossComments: false 10 | AlignCompound: false 11 | AlignFunctionPointers: false 12 | PadOperators: true 13 | AlignConsecutiveBitFields: 14 | Enabled: true 15 | AcrossEmptyLines: false 16 | AcrossComments: false 17 | AlignCompound: false 18 | AlignFunctionPointers: false 19 | PadOperators: false 20 | AlignConsecutiveDeclarations: 21 | Enabled: false 22 | AcrossEmptyLines: false 23 | AcrossComments: false 24 | AlignCompound: false 25 | AlignFunctionPointers: false 26 | PadOperators: false 27 | AlignConsecutiveMacros: 28 | Enabled: true 29 | AcrossEmptyLines: false 30 | AcrossComments: false 31 | AlignCompound: false 32 | AlignFunctionPointers: false 33 | PadOperators: false 34 | AlignConsecutiveShortCaseStatements: 35 | Enabled: false 36 | AcrossEmptyLines: false 37 | AcrossComments: false 38 | AlignCaseArrows: false 39 | AlignCaseColons: false 40 | AlignConsecutiveTableGenBreakingDAGArgColons: 41 | Enabled: false 42 | AcrossEmptyLines: false 43 | AcrossComments: false 44 | AlignCompound: false 45 | AlignFunctionPointers: false 46 | PadOperators: false 47 | AlignConsecutiveTableGenCondOperatorColons: 48 | Enabled: false 49 | AcrossEmptyLines: false 50 | AcrossComments: false 51 | AlignCompound: false 52 | AlignFunctionPointers: false 53 | PadOperators: false 54 | AlignConsecutiveTableGenDefinitionColons: 55 | Enabled: false 56 | AcrossEmptyLines: false 57 | AcrossComments: false 58 | 
AlignCompound: false 59 | AlignFunctionPointers: false 60 | PadOperators: false 61 | AlignEscapedNewlines: DontAlign 62 | AlignOperands: DontAlign 63 | AlignTrailingComments: 64 | Kind: Always 65 | OverEmptyLines: 0 66 | AllowAllArgumentsOnNextLine: true 67 | AllowAllParametersOfDeclarationOnNextLine: true 68 | AllowBreakBeforeNoexceptSpecifier: Never 69 | AllowShortBlocksOnASingleLine: Never 70 | AllowShortCaseExpressionOnASingleLine: true 71 | AllowShortCaseLabelsOnASingleLine: false 72 | AllowShortCompoundRequirementOnASingleLine: true 73 | AllowShortEnumsOnASingleLine: false 74 | AllowShortFunctionsOnASingleLine: None 75 | AllowShortIfStatementsOnASingleLine: Never 76 | AllowShortLambdasOnASingleLine: All 77 | AllowShortLoopsOnASingleLine: false 78 | AlwaysBreakAfterDefinitionReturnType: None 79 | AlwaysBreakBeforeMultilineStrings: true 80 | AttributeMacros: 81 | - CPP 82 | - M256_ALIGN 83 | BinPackArguments: true 84 | BinPackParameters: true 85 | BitFieldColonSpacing: Both 86 | BraceWrapping: 87 | AfterCaseLabel: false 88 | AfterClass: false 89 | AfterControlStatement: Never 90 | AfterEnum: false 91 | AfterFunction: false 92 | AfterNamespace: false 93 | AfterObjCDeclaration: false 94 | AfterStruct: false 95 | AfterUnion: false 96 | AfterExternBlock: false 97 | BeforeCatch: false 98 | BeforeElse: false 99 | BeforeLambdaBody: false 100 | BeforeWhile: false 101 | IndentBraces: false 102 | SplitEmptyFunction: true 103 | SplitEmptyRecord: true 104 | SplitEmptyNamespace: true 105 | BreakAdjacentStringLiterals: true 106 | BreakAfterAttributes: Leave 107 | BreakAfterJavaFieldAnnotations: false 108 | BreakAfterReturnType: None 109 | BreakArrays: true 110 | BreakBeforeBinaryOperators: All 111 | BreakBeforeBraces: Attach 112 | BreakBeforeConceptDeclarations: Always 113 | BreakBeforeInlineASMColon: OnlyMultiline 114 | BreakBeforeTernaryOperators: true 115 | BreakConstructorInitializers: BeforeComma 116 | BreakFunctionDefinitionParameters: false 117 | BreakInheritanceList: 
BeforeColon 118 | BreakStringLiterals: true 119 | BreakTemplateDeclarations: Yes 120 | ColumnLimit: 0 121 | CommentPragmas: "^ IWYU pragma:" 122 | CompactNamespaces: false 123 | ConstructorInitializerIndentWidth: 4 124 | ContinuationIndentWidth: 4 125 | Cpp11BracedListStyle: true 126 | DerivePointerAlignment: false 127 | DisableFormat: false 128 | EmptyLineAfterAccessModifier: Never 129 | EmptyLineBeforeAccessModifier: LogicalBlock 130 | ExperimentalAutoDetectBinPacking: false 131 | FixNamespaceComments: true 132 | ForEachMacros: 133 | - BLABLA 134 | IfMacros: 135 | - BLABLA 136 | IncludeBlocks: Preserve 137 | IncludeCategories: 138 | - Regex: ^<ext/.*\.h> 139 | Priority: 2 140 | SortPriority: 0 141 | CaseSensitive: false 142 | - Regex: ^<.*\.h> 143 | Priority: 1 144 | SortPriority: 0 145 | CaseSensitive: false 146 | - Regex: ^<.* 147 | Priority: 2 148 | SortPriority: 0 149 | CaseSensitive: false 150 | - Regex: .* 151 | Priority: 3 152 | SortPriority: 0 153 | CaseSensitive: false 154 | IncludeIsMainRegex: ([-_](test|unittest))?$ 155 | IncludeIsMainSourceRegex: "" 156 | IndentAccessModifiers: false 157 | IndentCaseBlocks: false 158 | IndentCaseLabels: true 159 | IndentExternBlock: NoIndent 160 | IndentGotoLabels: true 161 | IndentPPDirectives: AfterHash 162 | IndentRequiresClause: true 163 | IndentWidth: 4 164 | IndentWrappedFunctionNames: false 165 | InsertBraces: false 166 | InsertNewlineAtEOF: false 167 | InsertTrailingCommas: None 168 | IntegerLiteralSeparator: 169 | Binary: 0 170 | BinaryMinDigits: 0 171 | Decimal: 0 172 | DecimalMinDigits: 0 173 | Hex: 0 174 | HexMinDigits: 0 175 | JavaScriptQuotes: Leave 176 | JavaScriptWrapImports: true 177 | KeepEmptyLines: 178 | AtEndOfFile: false 179 | AtStartOfBlock: false 180 | AtStartOfFile: true 181 | LambdaBodyIndentation: Signature 182 | LineEnding: DeriveLF 183 | MacroBlockBegin: "" 184 | MacroBlockEnd: "" 185 | MainIncludeChar: Quote 186 | MaxEmptyLinesToKeep: 1 187 | NamespaceIndentation: None 188 | ObjCBinPackProtocolList: 
Never 189 | ObjCBlockIndentWidth: 2 190 | ObjCBreakBeforeNestedBlockParam: true 191 | ObjCSpaceAfterProperty: false 192 | ObjCSpaceBeforeProtocolList: true 193 | PPIndentWidth: -1 194 | PackConstructorInitializers: NextLine 195 | PenaltyBreakAssignment: 2 196 | PenaltyBreakBeforeFirstCallParameter: 1 197 | PenaltyBreakComment: 300 198 | PenaltyBreakFirstLessLess: 120 199 | PenaltyBreakOpenParenthesis: 0 200 | PenaltyBreakScopeResolution: 500 201 | PenaltyBreakString: 1000 202 | PenaltyBreakTemplateDeclaration: 10 203 | PenaltyExcessCharacter: 1000000 204 | PenaltyIndentedWhitespace: 0 205 | PenaltyReturnTypeOnItsOwnLine: 200 206 | PointerAlignment: Left 207 | QualifierAlignment: Leave 208 | RawStringFormats: 209 | - Language: Cpp 210 | Delimiters: 211 | - cc 212 | - CC 213 | - cpp 214 | - Cpp 215 | - CPP 216 | - c++ 217 | - C++ 218 | CanonicalDelimiter: "" 219 | BasedOnStyle: google 220 | - Language: TextProto 221 | Delimiters: 222 | - pb 223 | - PB 224 | - proto 225 | - PROTO 226 | EnclosingFunctions: 227 | - EqualsProto 228 | - EquivToProto 229 | - PARSE_PARTIAL_TEXT_PROTO 230 | - PARSE_TEST_PROTO 231 | - PARSE_TEXT_PROTO 232 | - ParseTextOrDie 233 | - ParseTextProtoOrDie 234 | - ParseTestProto 235 | - ParsePartialTestProto 236 | CanonicalDelimiter: pb 237 | BasedOnStyle: google 238 | ReferenceAlignment: Pointer 239 | ReflowComments: true 240 | RemoveBracesLLVM: false 241 | RemoveParentheses: Leave 242 | RemoveSemicolon: false 243 | RequiresClausePosition: OwnLine 244 | RequiresExpressionIndentation: OuterScope 245 | SeparateDefinitionBlocks: Always 246 | ShortNamespaceLines: 1 247 | SkipMacroDefinitionBody: false 248 | SortIncludes: CaseSensitive 249 | SortJavaStaticImport: Before 250 | SortUsingDeclarations: LexicographicNumeric 251 | SpaceAfterCStyleCast: false 252 | SpaceAfterLogicalNot: false 253 | SpaceAfterTemplateKeyword: true 254 | SpaceAroundPointerQualifiers: Default 255 | SpaceBeforeAssignmentOperators: true 256 | SpaceBeforeCaseColon: false 257 | 
SpaceBeforeCpp11BracedList: false 258 | SpaceBeforeCtorInitializerColon: true 259 | SpaceBeforeInheritanceColon: true 260 | SpaceBeforeJsonColon: false 261 | SpaceBeforeParens: ControlStatements 262 | SpaceBeforeParensOptions: 263 | AfterControlStatements: true 264 | AfterForeachMacros: true 265 | AfterFunctionDeclarationName: false 266 | AfterFunctionDefinitionName: false 267 | AfterIfMacros: true 268 | AfterOverloadedOperator: false 269 | AfterPlacementOperator: true 270 | AfterRequiresInClause: false 271 | AfterRequiresInExpression: false 272 | BeforeNonEmptyParentheses: false 273 | SpaceBeforeRangeBasedForLoopColon: true 274 | SpaceBeforeSquareBrackets: false 275 | SpaceInEmptyBlock: false 276 | SpacesBeforeTrailingComments: 1 277 | SpacesInAngles: Never 278 | SpacesInContainerLiterals: true 279 | SpacesInLineCommentPrefix: 280 | Minimum: 1 281 | Maximum: -1 282 | SpacesInParens: Never 283 | SpacesInParensOptions: 284 | ExceptDoubleParentheses: false 285 | InConditionalStatements: false 286 | InCStyleCasts: false 287 | InEmptyParentheses: false 288 | Other: false 289 | SpacesInSquareBrackets: false 290 | Standard: Auto 291 | StatementAttributeLikeMacros: 292 | - M256_ALIGN 293 | StatementMacros: 294 | - BLABLA 295 | TabWidth: 4 296 | TableGenBreakInsideDAGArg: DontBreak 297 | UseTab: Never 298 | VerilogBreakBetweenInstancePorts: true 299 | WhitespaceSensitiveMacros: 300 | - BLABLA 301 | AlwaysBreakAfterReturnType: None 302 | AlwaysBreakTemplateDeclarations: Yes 303 | KeepEmptyLinesAtTheStartOfBlocks: false 304 | Language: Cpp 305 | SpaceInEmptyParentheses: false 306 | SpacesInCStyleCastParentheses: false 307 | SpacesInConditionalStatement: false 308 | SpacesInParentheses: false 309 | TypenameMacros: 310 | - BLABLA 311 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22...3.30) 2 | 3 | 
# Include paths exported to consumers of the INTERFACE library:
#  - the library root, so that "ml.h" and "Guts/*" resolve
#  - the fetched sse2neon sources, only when IS_ARM64 is TRUE (sse2neon is fetched only in that case)
target_include_directories(MathLib INTERFACE
    .
    $<$<BOOL:${IS_ARM64}>:${sse2neon_SOURCE_DIR}>
)
-------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | // asfloat 6 | ML_INLINE float asfloat(uint32_t x) { 7 | return *(float*)&x; 8 | } 9 | 10 | ML_INLINE float2 asfloat(const uint2& x) { 11 | return float2(asfloat(x.x), asfloat(x.y)); 12 | } 13 | 14 | ML_INLINE float4 asfloat(const uint4& x) { 15 | return _mm_castsi128_ps(x.xmm); 16 | } 17 | 18 | // asuint 19 | ML_INLINE uint32_t asuint(float x) { 20 | return *(uint32_t*)&x; 21 | } 22 | 23 | ML_INLINE uint2 asuint(const float2& x) { 24 | return uint2(asuint(x.x), asuint(x.y)); 25 | } 26 | 27 | ML_INLINE uint4 asuint(const float4& x) { 28 | return _mm_castps_si128(x.xmm); 29 | } 30 | 31 | // From bool2 32 | ML_INLINE bool2::operator int2() const { 33 | return int2((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0); 34 | } 35 | 36 | ML_INLINE bool2::operator uint2() const { 37 | return uint2((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0); 38 | } 39 | 40 | ML_INLINE bool2::operator float2() const { 41 | return float2((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f); 42 | } 43 | 44 | ML_INLINE bool2::operator double2() const { 45 | return double2((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0); 46 | } 47 | 48 | // From bool3 49 | ML_INLINE bool3::operator int3() const { 50 | return int3((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0); 51 | } 52 | 53 | ML_INLINE bool3::operator uint3() const { 54 | return uint3((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0); 55 | } 56 | 57 | ML_INLINE bool3::operator float3() const { 58 | return float3((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f, (mask & 0x4) ? 1.0f : 0.0f); 59 | } 60 | 61 | ML_INLINE bool3::operator double3() const { 62 | return double3((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0, (mask & 0x4) ? 1.0 : 0.0); 63 | } 64 | 65 | // From bool4 66 | ML_INLINE bool4::operator int4() const { 67 | return int4((mask & 0x1) ? 
1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0, (mask & 0x8) ? 1 : 0); 68 | } 69 | 70 | ML_INLINE bool4::operator uint4() const { 71 | return uint4((mask & 0x1) ? 1 : 0, (mask & 0x2) ? 1 : 0, (mask & 0x4) ? 1 : 0, (mask & 0x8) ? 1 : 0); 72 | } 73 | 74 | ML_INLINE bool4::operator float4() const { 75 | return float4((mask & 0x1) ? 1.0f : 0.0f, (mask & 0x2) ? 1.0f : 0.0f, (mask & 0x4) ? 1.0f : 0.0f, (mask & 0x8) ? 1.0f : 0.0f); 76 | } 77 | 78 | ML_INLINE bool4::operator double4() const { 79 | return double4((mask & 0x1) ? 1.0 : 0.0, (mask & 0x2) ? 1.0 : 0.0, (mask & 0x4) ? 1.0 : 0.0, (mask & 0x8) ? 1.0 : 0.0); 80 | } 81 | 82 | // From int2 83 | ML_INLINE int2::operator uint2() const { 84 | return uint2((uint)x, (uint)y); 85 | } 86 | 87 | ML_INLINE int2::operator float2() const { 88 | return float2((float)x, (float)y); 89 | } 90 | 91 | ML_INLINE int2::operator double2() const { 92 | return double2((double)x, (double)y); 93 | } 94 | 95 | // From uint2 96 | ML_INLINE uint2::operator int2() const { 97 | return int2((int32_t)x, (int32_t)y); 98 | } 99 | 100 | ML_INLINE uint2::operator float2() const { 101 | return float2((float)x, (float)y); 102 | } 103 | 104 | ML_INLINE uint2::operator double2() const { 105 | return double2((double)x, (double)y); 106 | } 107 | 108 | // From float2 109 | ML_INLINE float2::operator int2() const { 110 | return int2((int32_t)x, (int32_t)y); 111 | } 112 | 113 | ML_INLINE float2::operator uint2() const { 114 | return uint2((uint)x, (uint)y); 115 | } 116 | 117 | ML_INLINE float2::operator double2() const { 118 | return double2((double)x, (double)y); 119 | } 120 | 121 | // From double2 122 | ML_INLINE double2::operator int2() const { 123 | return int2((int32_t)x, (int32_t)y); 124 | } 125 | 126 | ML_INLINE double2::operator uint2() const { 127 | return uint2((uint)x, (uint)y); 128 | } 129 | 130 | ML_INLINE double2::operator float2() const { 131 | return float2((float)x, (float)y); 132 | } 133 | 134 | // From int3 135 | ML_INLINE int3::operator 
uint3() const { 136 | return xmm; 137 | } 138 | 139 | ML_INLINE int3::operator float3() const { 140 | return _mm_cvtepi32_ps(xmm); 141 | } 142 | 143 | ML_INLINE int3::operator double3() const { 144 | return _mm256_cvtepi32_pd(xmm); 145 | } 146 | 147 | // From uint3 148 | ML_INLINE uint3::operator int3() const { 149 | return xmm; 150 | } 151 | 152 | ML_INLINE uint3::operator float3() const { 153 | return _mm_cvtepi32_ps(xmm); 154 | } 155 | 156 | ML_INLINE uint3::operator double3() const { 157 | return _mm256_cvtepi32_pd(xmm); 158 | } 159 | 160 | // From float3 161 | ML_INLINE float3::operator int3() const { 162 | return _mm_cvtps_epi32(xmm); 163 | } 164 | 165 | ML_INLINE float3::operator uint3() const { 166 | return _mm_cvtps_epi32(xmm); 167 | } 168 | 169 | ML_INLINE float3::operator double3() const { 170 | return _mm256_cvtps_pd(xmm); 171 | } 172 | 173 | // From double3 174 | ML_INLINE double3::operator int3() const { 175 | return _mm256_cvtpd_epi32(ymm); 176 | } 177 | 178 | ML_INLINE double3::operator uint3() const { 179 | return _mm256_cvtpd_epi32(ymm); 180 | } 181 | 182 | ML_INLINE double3::operator float3() const { 183 | return _mm256_cvtpd_ps(ymm); 184 | } 185 | 186 | // From int4 187 | ML_INLINE int4::operator uint4() const { 188 | return xmm; 189 | } 190 | 191 | ML_INLINE int4::operator float4() const { 192 | return _mm_cvtepi32_ps(xmm); 193 | } 194 | 195 | ML_INLINE int4::operator double4() const { 196 | return _mm256_cvtepi32_pd(xmm); 197 | } 198 | 199 | // From uint4 200 | ML_INLINE uint4::operator int4() const { 201 | return xmm; 202 | } 203 | 204 | ML_INLINE uint4::operator float4() const { 205 | return _mm_cvtepi32_ps(xmm); 206 | } 207 | 208 | ML_INLINE uint4::operator double4() const { 209 | return _mm256_cvtepi32_pd(xmm); 210 | } 211 | 212 | // From float4 213 | ML_INLINE float4::operator int4() const { 214 | return _mm_cvtps_epi32(xmm); 215 | } 216 | 217 | ML_INLINE float4::operator uint4() const { 218 | return _mm_cvtps_epi32(xmm); 219 | } 220 | 
221 | ML_INLINE float4::operator double4() const { 222 | return _mm256_cvtps_pd(xmm); 223 | } 224 | 225 | // From double4 226 | ML_INLINE double4::operator int4() const { 227 | return _mm256_cvtpd_epi32(ymm); 228 | } 229 | 230 | ML_INLINE double4::operator uint4() const { 231 | return _mm256_cvtpd_epi32(ymm); 232 | } 233 | 234 | ML_INLINE double4::operator float4() const { 235 | return _mm256_cvtpd_ps(ymm); 236 | } 237 | 238 | // From float4x4 239 | ML_INLINE float4x4::operator double4x4() const { 240 | double4x4 r; 241 | r.ca[0] = _mm256_cvtps_pd(ca[0]); 242 | r.ca[1] = _mm256_cvtps_pd(ca[1]); 243 | r.ca[2] = _mm256_cvtps_pd(ca[2]); 244 | r.ca[3] = _mm256_cvtps_pd(ca[3]); 245 | 246 | return r; 247 | } 248 | 249 | // From double4x4 250 | ML_INLINE double4x4::operator float4x4() const { 251 | float4x4 r; 252 | r.ca[0] = _mm256_cvtpd_ps(ca[0]); 253 | r.ca[1] = _mm256_cvtpd_ps(ca[1]); 254 | r.ca[2] = _mm256_cvtpd_ps(ca[2]); 255 | r.ca[3] = _mm256_cvtpd_ps(ca[3]); 256 | 257 | return r; 258 | } 259 | -------------------------------------------------------------------------------- /Guts/f16.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | #define F16_M_BITS 10 6 | #define F16_E_BITS 5 7 | #define F16_S_MASK 0x8000 8 | 9 | template 10 | ML_INLINE uint32_t ToSmallFloat(float x) { 11 | const int32_t E_MASK = (1 << E_BITS) - 1; 12 | const uint32_t INF = uint32_t(E_MASK) << uint32_t(M_BITS); 13 | const int32_t BIAS = E_MASK >> 1; 14 | const int32_t ROUND = 1 << (23 - M_BITS - 1); 15 | 16 | // decompose float 17 | uint32_t f32 = *(uint32_t*)&x; 18 | uint32_t packed = (f32 >> 16) & S_MASK; 19 | int32_t e = ((f32 >> 23) & 0xFF) - 127 + BIAS; 20 | int32_t m = f32 & 0x007FFFFF; 21 | 22 | if (e == 128 + BIAS) { 23 | // Inf 24 | packed |= INF; 25 | 26 | if (m) { 27 | // NaN 28 | m >>= 23 - M_BITS; 29 | packed |= m | (m == 0); 30 | } 31 | } else if (e > 0) { 32 | // round to nearest, 
round "0.5" up 33 | if (m & ROUND) { 34 | m += ROUND << 1; 35 | 36 | if (m & 0x00800000) { 37 | // mantissa overflow 38 | m = 0; 39 | e++; 40 | } 41 | } 42 | 43 | if (e >= E_MASK) { 44 | // exponent overflow - flush to Inf 45 | packed |= INF; 46 | } else { 47 | // representable value 48 | m >>= 23 - M_BITS; 49 | packed |= (e << M_BITS) | m; 50 | } 51 | } else { 52 | // denormalized or zero 53 | m = ((m | 0x00800000) >> (1 - e)) + ROUND; 54 | m >>= 23 - M_BITS; 55 | packed |= m; 56 | } 57 | 58 | return packed; 59 | } 60 | 61 | template 62 | ML_INLINE float FromSmallFloat(uint32_t x) { 63 | const uint32_t E_MASK = (1 << E_BITS) - 1; 64 | const int32_t BIAS = E_MASK >> 1; 65 | const float DENORM_SCALE = 1.0f / (1 << (14 + M_BITS)); 66 | const float NORM_SCALE = 1.0f / float(1 << M_BITS); 67 | 68 | int32_t s = (x & S_MASK) << 15; 69 | int32_t e = (x >> M_BITS) & E_MASK; 70 | int32_t m = x & ((1 << M_BITS) - 1); 71 | 72 | uFloat f; 73 | if (e == 0) 74 | f.f = DENORM_SCALE * m; 75 | else if (e == E_MASK) 76 | f.i = s | 0x7F800000 | (m << (23 - M_BITS)); 77 | else { 78 | f.f = 1.0f + float(m) * NORM_SCALE; 79 | 80 | if (e < BIAS) 81 | f.f /= float(1 << (BIAS - e)); 82 | else 83 | f.f *= float(1 << (e - BIAS)); 84 | } 85 | 86 | if (s) 87 | f.f = -f.f; 88 | 89 | return f.f; 90 | } 91 | 92 | ML_INLINE uint32_t f32tof16(float x) { 93 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 94 | v4f v = v4f_set(x, 0.0f, 0.0f, 0.0f); 95 | v4i p = v4f_to_h4(v); 96 | 97 | uint32_t r = _mm_cvtsi128_si32(p); 98 | #else 99 | uint32_t r = ToSmallFloat(x); 100 | #endif 101 | 102 | return r; 103 | } 104 | 105 | ML_INLINE float f16tof32(uint32_t x) { 106 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 107 | v4i p = _mm_cvtsi32_si128(x); 108 | v4f f = _mm_cvtph_ps(p); 109 | 110 | return _mm_cvtss_f32(f); 111 | #else 112 | return FromSmallFloat(x); 113 | #endif 114 | } 115 | 116 | #ifndef __ARM_NEON 117 | struct float16_t { 118 | uint16_t x; 119 | 120 | ML_INLINE float16_t(float v) { 121 | x = 
(uint16_t)f32tof16(v); 122 | } 123 | 124 | ML_INLINE operator float() const { 125 | return f16tof32(x); 126 | } 127 | 128 | ML_INLINE float16_t() = default; 129 | ML_INLINE float16_t(const float16_t&) = default; 130 | ML_INLINE float16_t& operator=(const float16_t&) = default; 131 | }; 132 | #endif 133 | 134 | struct float16_t2 { 135 | float16_t x, y; 136 | 137 | ML_INLINE float16_t2(const float16_t& x, const float16_t& y) 138 | : x(x), y(y) { 139 | } 140 | 141 | ML_INLINE float16_t2() = default; 142 | ML_INLINE float16_t2(const float16_t2&) = default; 143 | ML_INLINE float16_t2& operator=(const float16_t2&) = default; 144 | }; 145 | 146 | struct float16_t4 { 147 | float16_t x, y, z, w; 148 | 149 | ML_INLINE float16_t4(const float16_t& x, const float16_t& y, const float16_t& z, const float16_t& w) 150 | : x(x), y(y), z(z), w(w) { 151 | } 152 | 153 | ML_INLINE float16_t4(const float16_t2& xy, const float16_t2& zw) { 154 | *((float16_t2*)&x) = xy; 155 | *((float16_t2*)&z) = zw; 156 | } 157 | 158 | ML_INLINE float16_t4() = default; 159 | ML_INLINE float16_t4(const float16_t4&) = default; 160 | ML_INLINE float16_t4& operator=(const float16_t4&) = default; 161 | }; 162 | -------------------------------------------------------------------------------- /Guts/f32.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | //====================================================================================================================== 6 | // float2 7 | //====================================================================================================================== 8 | 9 | union float2 { 10 | v2i mm; 11 | 12 | struct { 13 | float a[COORD_2D]; 14 | }; 15 | 16 | struct { 17 | float x, y; 18 | }; 19 | 20 | ML_SWIZZLE_2(float2, float); 21 | 22 | public: 23 | ML_INLINE float2() 24 | : mm(0) { 25 | } 26 | 27 | ML_INLINE float2(float c) 28 | : x(c), y(c) { 29 | } 30 | 31 | ML_INLINE 
float2(float _x, float _y) 32 | : x(_x), y(_y) { 33 | } 34 | 35 | ML_INLINE float2(const float2& v) = default; 36 | 37 | // Set 38 | 39 | ML_INLINE void operator=(const float2& v) { 40 | mm = v.mm; 41 | } 42 | 43 | // Conversion 44 | 45 | ML_INLINE operator int2() const; 46 | ML_INLINE operator uint2() const; 47 | ML_INLINE operator double2() const; 48 | 49 | // Compare 50 | 51 | ML_COMPARE_UNOPT(bool2, float2, <) 52 | ML_COMPARE_UNOPT(bool2, float2, <=) 53 | ML_COMPARE_UNOPT(bool2, float2, ==) 54 | ML_COMPARE_UNOPT(bool2, float2, >=) 55 | ML_COMPARE_UNOPT(bool2, float2, >) 56 | ML_COMPARE_UNOPT(bool2, float2, !=) 57 | 58 | // Ops 59 | 60 | ML_INLINE float2 operator-() const { 61 | return float2(-x, -y); 62 | } 63 | 64 | ML_OP_UNOPT(float2, float, -, -=) 65 | ML_OP_UNOPT(float2, float, +, +=) 66 | ML_OP_UNOPT(float2, float, *, *=) 67 | ML_OP_UNOPT(float2, float, /, /=) 68 | }; 69 | 70 | ML_INLINE float2 degrees(const float2& x) { 71 | return x * (180.0f / acosf(-1.0f)); 72 | } 73 | 74 | ML_INLINE float2 radians(const float2& x) { 75 | return x * (acosf(-1.0f) / 180.0f); 76 | } 77 | 78 | ML_INLINE float2 sign(const float2& x) { 79 | return float2(sign(x.x), sign(x.y)); 80 | } 81 | 82 | ML_INLINE float2 abs(const float2& x) { 83 | return float2(abs(x.x), abs(x.y)); 84 | } 85 | 86 | ML_INLINE float2 floor(const float2& x) { 87 | return float2(floor(x.x), floor(x.y)); 88 | } 89 | 90 | ML_INLINE float2 round(const float2& x) { 91 | return float2(round(x.x), round(x.y)); 92 | } 93 | 94 | ML_INLINE float2 ceil(const float2& x) { 95 | return float2(ceil(x.x), ceil(x.y)); 96 | } 97 | 98 | ML_INLINE float2 frac(const float2& x) { 99 | return float2(frac(x.x), frac(x.y)); 100 | } 101 | 102 | ML_INLINE float2 fmod(const float2& x, const float2& y) { 103 | return float2(fmod(x.x, y.x), fmod(x.y, y.y)); 104 | } 105 | 106 | ML_INLINE float2 min(const float2& x, const float2& y) { 107 | return float2(min(x.x, y.x), min(x.y, y.y)); 108 | } 109 | 110 | ML_INLINE float2 max(const 
float2& x, const float2& y) { 111 | return float2(max(x.x, y.x), max(x.y, y.y)); 112 | } 113 | 114 | ML_INLINE float2 clamp(const float2& x, const float2& a, const float2& b) { 115 | return float2(clamp(x.x, a.x, b.x), clamp(x.y, a.y, b.y)); 116 | } 117 | 118 | ML_INLINE float2 saturate(const float2& x) { 119 | return float2(clamp(x.x, 0.0f, 1.0f), clamp(x.y, 0.0f, 1.0f)); 120 | } 121 | 122 | ML_INLINE float2 lerp(const float2& a, const float2& b, const float2& x) { 123 | return a + (b - a) * x; 124 | } 125 | 126 | ML_INLINE float2 linearstep(const float2& a, const float2& b, const float2& x) { 127 | return saturate((x - a) / (b - a)); 128 | } 129 | 130 | ML_INLINE float2 smoothstep(const float2& a, const float2& b, const float2& x) { 131 | float2 t = linearstep(a, b, x); 132 | 133 | return t * t * (3.0f - 2.0f * t); 134 | } 135 | 136 | ML_INLINE float2 step(const float2& edge, const float2& x) { 137 | return float2(step(edge.x, x.x), step(edge.y, x.y)); 138 | } 139 | 140 | ML_INLINE float2 sin(const float2& x) { 141 | return float2(sin(x.x), sin(x.y)); 142 | } 143 | 144 | ML_INLINE float2 cos(const float2& x) { 145 | return float2(cos(x.x), cos(x.y)); 146 | } 147 | 148 | ML_INLINE float2 tan(const float2& x) { 149 | return float2(tan(x.x), tan(x.y)); 150 | } 151 | 152 | ML_INLINE float2 asin(const float2& x) { 153 | return float2(asin(x.x), asin(x.y)); 154 | } 155 | 156 | ML_INLINE float2 acos(const float2& x) { 157 | return float2(acos(x.x), acos(x.y)); 158 | } 159 | 160 | ML_INLINE float2 atan(const float2& x) { 161 | return float2(atan(x.x), atan(x.y)); 162 | } 163 | 164 | ML_INLINE float2 atan2(const float2& y, const float2& x) { 165 | return float2(atan2(y.x, x.x), atan2(y.y, x.y)); 166 | } 167 | 168 | ML_INLINE float2 sqrt(const float2& x) { 169 | return float2(sqrt(x.x), sqrt(x.y)); 170 | } 171 | 172 | ML_INLINE float2 rsqrt(const float2& x) { 173 | return float2(rsqrt(x.x), rsqrt(x.y)); 174 | } 175 | 176 | ML_INLINE float2 rcp(const float2& x) { 177 | 
return float2(rcp(x.x), rcp(x.y)); 178 | } 179 | 180 | ML_INLINE float2 pow(const float2& x, const float2& y) { 181 | return float2(pow(x.x, y.x), pow(x.y, y.y)); 182 | } 183 | 184 | ML_INLINE float2 log(const float2& x) { 185 | return float2(log(x.x), log(x.y)); 186 | } 187 | 188 | ML_INLINE float2 log2(const float2& x) { 189 | return float2(log2(x.x), log2(x.y)); 190 | } 191 | 192 | ML_INLINE float2 exp(const float2& x) { 193 | return float2(exp(x.x), exp(x.y)); 194 | } 195 | 196 | ML_INLINE float2 exp2(const float2& x) { 197 | return float2(exp2(x.x), exp2(x.y)); 198 | } 199 | 200 | ML_INLINE float2 madd(const float2& a, const float2& b, const float2& c) { 201 | return a * b + c; 202 | } 203 | 204 | ML_INLINE float dot(const float2& a, const float2& b) { 205 | return a.x * b.x + a.y * b.y; 206 | } 207 | 208 | ML_INLINE float length(const float2& x) { 209 | return sqrt(dot(x, x)); 210 | } 211 | 212 | ML_INLINE float2 normalize(const float2& x) { 213 | return x / length(x); 214 | } 215 | 216 | // non-HLSL 217 | 218 | ML_INLINE float2 Pi(const float2& mul) { 219 | return mul * acosf(-1.0f); 220 | } 221 | 222 | ML_INLINE float2 GetPerpendicularVector(const float2& a) { 223 | return float2(-a.y, a.x); 224 | } 225 | 226 | ML_INLINE float2 Snap(const float2& x, const float2& step) { 227 | return round(x / step) * step; 228 | } 229 | 230 | ML_INLINE float2 Rotate(const float2& v, float angle) { 231 | float sa = sin(angle); 232 | float ca = cos(angle); 233 | 234 | float2 p; 235 | p.x = ca * v.x + sa * v.y; 236 | p.y = ca * v.y - sa * v.x; 237 | 238 | return p; 239 | } 240 | 241 | //====================================================================================================================== 242 | // float3 243 | //====================================================================================================================== 244 | 245 | union float3 { 246 | v4f xmm; 247 | 248 | struct { 249 | float a[COORD_3D]; 250 | }; 251 | 252 | struct { 253 | float x, 
y, z; 254 | }; 255 | 256 | ML_SWIZZLE_3(v4f_swizzle2, float2, v4f_swizzle3, float3); 257 | 258 | public: 259 | ML_INLINE float3() 260 | : xmm(_mm_setzero_ps()) { 261 | } 262 | 263 | ML_INLINE float3(float c) 264 | : xmm(_mm_set1_ps(c)) { 265 | } 266 | 267 | ML_INLINE float3(float _x, float _y, float _z) 268 | : xmm(v4f_set(_x, _y, _z, 0.0f)) { 269 | } 270 | 271 | ML_INLINE float3(const float2& v, float _z) 272 | : xmm(v4f_set(v.x, v.y, _z, 0.0f)) { 273 | } 274 | 275 | ML_INLINE float3(float _x, const float2& v) 276 | : xmm(v4f_set(_x, v.x, v.y, 0.0f)) { 277 | } 278 | 279 | ML_INLINE float3(const v4f& v) 280 | : xmm(v) { 281 | } 282 | 283 | ML_INLINE float3(const float* v3) 284 | : xmm(v4f_set(v3[0], v3[1], v3[2], 0.0f)) { 285 | } 286 | 287 | ML_INLINE float3(const float3& v) = default; 288 | 289 | // Set 290 | 291 | ML_INLINE void operator=(const float3& v) { 292 | xmm = v.xmm; 293 | } 294 | 295 | // Conversion 296 | 297 | ML_INLINE operator int3() const; 298 | ML_INLINE operator uint3() const; 299 | ML_INLINE operator double3() const; 300 | 301 | // Compare 302 | 303 | ML_COMPARE(bool3, float3, <, _mm_cmplt_ps, _mm_movemask_ps, xmm) 304 | ML_COMPARE(bool3, float3, <=, _mm_cmple_ps, _mm_movemask_ps, xmm) 305 | ML_COMPARE(bool3, float3, ==, _mm_cmpeq_ps, _mm_movemask_ps, xmm) 306 | ML_COMPARE(bool3, float3, >, _mm_cmpgt_ps, _mm_movemask_ps, xmm) 307 | ML_COMPARE(bool3, float3, >=, _mm_cmpge_ps, _mm_movemask_ps, xmm) 308 | ML_COMPARE(bool3, float3, !=, _mm_cmpneq_ps, _mm_movemask_ps, xmm) 309 | 310 | // Ops 311 | 312 | ML_INLINE float3 operator-() const { 313 | return v4f_negate(xmm); 314 | } 315 | 316 | ML_OP(float3, float, -, -=, _mm_sub_ps, _mm_set1_ps, xmm) 317 | ML_OP(float3, float, +, +=, _mm_add_ps, _mm_set1_ps, xmm) 318 | ML_OP(float3, float, *, *=, _mm_mul_ps, _mm_set1_ps, xmm) 319 | ML_OP(float3, float, /, /=, _mm_div_ps, _mm_set1_ps, xmm) 320 | 321 | // Misc 322 | 323 | ML_INLINE operator v4f() const { 324 | return xmm; 325 | } 326 | 327 | static ML_INLINE 
float3 Zero() { 328 | return _mm_setzero_ps(); 329 | } 330 | }; 331 | 332 | ML_INLINE float3 degrees(const float3& x) { 333 | return x * (180.0f / acosf(-1.0f)); 334 | } 335 | 336 | ML_INLINE float3 radians(const float3& x) { 337 | return x * (acosf(-1.0f) / 180.0f); 338 | } 339 | 340 | ML_INLINE float3 sign(const float3& x) { 341 | return v4f_sign(x.xmm); 342 | } 343 | 344 | ML_INLINE float3 abs(const float3& x) { 345 | return v4f_abs(x.xmm); 346 | } 347 | 348 | ML_INLINE float3 floor(const float3& x) { 349 | return v4f_floor(x.xmm); 350 | } 351 | 352 | ML_INLINE float3 round(const float3& x) { 353 | return v4f_round(x.xmm); 354 | } 355 | 356 | ML_INLINE float3 ceil(const float3& x) { 357 | return v4f_ceil(x.xmm); 358 | } 359 | 360 | ML_INLINE float3 frac(const float3& x) { 361 | return v4f_frac(x.xmm); 362 | } 363 | 364 | ML_INLINE float3 fmod(const float3& x, const float3& y) { 365 | return v4f_mod(x.xmm, y.xmm); 366 | } 367 | 368 | ML_INLINE float3 min(const float3& x, const float3& y) { 369 | return _mm_min_ps(x.xmm, y.xmm); 370 | } 371 | 372 | ML_INLINE float3 max(const float3& x, const float3& y) { 373 | return _mm_max_ps(x.xmm, y.xmm); 374 | } 375 | 376 | ML_INLINE float3 clamp(const float3& x, const float3& a, const float3& b) { 377 | return v4f_clamp(x.xmm, a.xmm, b.xmm); 378 | } 379 | 380 | ML_INLINE float3 saturate(const float3& x) { 381 | return v4f_saturate(x.xmm); 382 | } 383 | 384 | ML_INLINE float3 lerp(const float3& a, const float3& b, const float3& x) { 385 | return v4f_mix(a.xmm, b.xmm, x.xmm); 386 | } 387 | 388 | ML_INLINE float3 linearstep(const float3& a, const float3& b, const float3& x) { 389 | return v4f_linearstep(a.xmm, b.xmm, x.xmm); 390 | } 391 | 392 | ML_INLINE float3 smoothstep(const float3& a, const float3& b, const float3& x) { 393 | return v4f_smoothstep(a.xmm, b.xmm, x.xmm); 394 | } 395 | 396 | ML_INLINE float3 step(const float3& edge, const float3& x) { 397 | return v4f_step(edge.xmm, x.xmm); 398 | } 399 | 400 | ML_INLINE float3 
sin(const float3& x) { 401 | return _mm_sin_ps(x.xmm); 402 | } 403 | 404 | ML_INLINE float3 cos(const float3& x) { 405 | return _mm_cos_ps(x.xmm); 406 | } 407 | 408 | ML_INLINE float3 tan(const float3& x) { 409 | return _mm_tan_ps(x.xmm); 410 | } 411 | 412 | ML_INLINE float3 asin(const float3& x) { 413 | ML_Assert(all(x >= float3(-1.0f)) && all(x <= float3(1.0f))); 414 | 415 | return _mm_asin_ps(x.xmm); 416 | } 417 | 418 | ML_INLINE float3 acos(const float3& x) { 419 | ML_Assert(all(x >= float3(-1.0f)) && all(x <= float3(1.0f))); 420 | 421 | return _mm_acos_ps(x.xmm); 422 | } 423 | 424 | ML_INLINE float3 atan(const float3& x) { 425 | return _mm_atan_ps(x.xmm); 426 | } 427 | 428 | ML_INLINE float3 atan2(const float3& y, const float3& x) { 429 | return _mm_atan2_ps(y.xmm, x.xmm); 430 | } 431 | 432 | ML_INLINE float3 sqrt(const float3& x) { 433 | return _mm_sqrt_ps(x.xmm); 434 | } 435 | 436 | ML_INLINE float3 rsqrt(const float3& x) { 437 | return v4f_rsqrt(x.xmm); 438 | } 439 | 440 | ML_INLINE float3 rcp(const float3& x) { 441 | return v4f_rcp(v4f_setw1(x.xmm)); 442 | } 443 | 444 | ML_INLINE float3 pow(const float3& x, const float3& y) { 445 | return _mm_pow_ps(x.xmm, y.xmm); 446 | } 447 | 448 | ML_INLINE float3 log(const float3& x) { 449 | return _mm_log_ps(x.xmm); 450 | } 451 | 452 | ML_INLINE float3 log2(const float3& x) { 453 | return _mm_log2_ps(x.xmm); 454 | } 455 | 456 | ML_INLINE float3 exp(const float3& x) { 457 | return _mm_exp_ps(x.xmm); 458 | } 459 | 460 | ML_INLINE float3 exp2(const float3& x) { 461 | return _mm_exp2_ps(x.xmm); 462 | } 463 | 464 | ML_INLINE float3 madd(const float3& a, const float3& b, const float3& c) { 465 | return _mm_fmadd_ps(a.xmm, b.xmm, c.xmm); 466 | } 467 | 468 | ML_INLINE float dot(const float3& a, const float3& b) { 469 | v4f r = v4f_dot33(a.xmm, b.xmm); 470 | 471 | return _mm_cvtss_f32(r); 472 | } 473 | 474 | ML_INLINE float length(const float3& x) { 475 | v4f r = v4f_length(x.xmm); 476 | 477 | return _mm_cvtss_f32(r); 478 | } 
479 | 480 | ML_INLINE float3 normalize(const float3& x) { 481 | return v4f_normalize(x.xmm); 482 | } 483 | 484 | ML_INLINE float3 cross(const float3& x, const float3& y) { 485 | return v4f_cross(x.xmm, y.xmm); 486 | } 487 | 488 | ML_INLINE float3 reflect(const float3& v, const float3& n) { 489 | // NOTE: slow 490 | // return v - n * dot(n, v) * 2; 491 | 492 | v4f dot0 = v4f_dot33(n.xmm, v.xmm); 493 | dot0 = _mm_mul_ps(dot0, _mm_set1_ps(2.0f)); 494 | 495 | return _mm_fnmadd_ps(n.xmm, dot0, v.xmm); 496 | } 497 | 498 | ML_INLINE float3 refract(const float3& v, const float3& n, float eta) { 499 | // NOTE: slow 500 | /* 501 | float dot = dot(v, n); 502 | float k = 1 - eta * eta * (1 - dot * dot); 503 | 504 | if( k < 0 ) 505 | return 0 506 | 507 | return v * eta - n * (eta * dot + Sqrt(k)); 508 | */ 509 | 510 | v4f eta0 = _mm_set1_ps(eta); 511 | v4f dot0 = v4f_dot33(n.xmm, v.xmm); 512 | v4f mul0 = _mm_mul_ps(eta0, eta0); 513 | v4f sub0 = _mm_fnmadd_ps(dot0, dot0, c_v4f_1111); 514 | v4f sub1 = _mm_fnmadd_ps(mul0, sub0, c_v4f_1111); 515 | 516 | if (v4f_isnegative4_all(sub1)) 517 | return _mm_setzero_ps(); 518 | 519 | v4f mul5 = _mm_mul_ps(eta0, v.xmm); 520 | v4f mul3 = _mm_mul_ps(eta0, dot0); 521 | v4f sqt0 = _mm_sqrt_ps(sub1); 522 | v4f add0 = _mm_add_ps(mul3, sqt0); 523 | 524 | return _mm_fnmadd_ps(add0, n.xmm, mul5); 525 | } 526 | 527 | // non-HLSL 528 | 529 | ML_INLINE float3 Pi(const float3& mul) { 530 | return mul * acosf(-1.0f); 531 | } 532 | 533 | ML_INLINE float3 GetPerpendicularVector(const float3& N) { 534 | float3 T = float3(N.z, -N.x, N.y); 535 | T -= N * dot(T, N); 536 | 537 | return normalize(T); 538 | } 539 | 540 | ML_INLINE float3 SinCos(const float3& x, float3* pCos) { 541 | return _mm_sincos_ps(&pCos->xmm, x.xmm); 542 | } 543 | 544 | ML_INLINE float3 Snap(const float3& x, const float3& step) { 545 | return round(x / step) * step; 546 | } 547 | 548 | ML_INLINE bool IsPointsNear(const float3& p1, const float3& p2, float eps) { 549 | v4f r = 
_mm_sub_ps(p1.xmm, p2.xmm); 550 | r = v4f_abs(r); 551 | r = _mm_cmple_ps(r, _mm_set1_ps(eps)); 552 | 553 | return v4f_test3_all(r); 554 | } 555 | 556 | //====================================================================================================================== 557 | // float4 558 | //====================================================================================================================== 559 | 560 | union float4 { 561 | v4f xmm; 562 | 563 | struct { 564 | float a[COORD_4D]; 565 | }; 566 | 567 | struct { 568 | float x, y, z, w; 569 | }; 570 | 571 | ML_SWIZZLE_4(v4f_swizzle2, float2, v4f_swizzle3, float3, v4f_swizzle4, float4); 572 | 573 | public: 574 | ML_INLINE float4() 575 | : xmm(_mm_setzero_ps()) { 576 | } 577 | 578 | ML_INLINE float4(float c) 579 | : xmm(_mm_set1_ps(c)) { 580 | } 581 | 582 | ML_INLINE float4(float _x, float _y, float _z, float _w) 583 | : xmm(v4f_set(_x, _y, _z, _w)) { 584 | } 585 | 586 | ML_INLINE float4(const float3& v, float _w) 587 | : xmm(v4f_set(v.x, v.y, v.z, _w)) { 588 | } 589 | 590 | ML_INLINE float4(const float2& a, const float2& b) 591 | : xmm(v4f_set(a.x, a.y, b.x, b.y)) { 592 | } 593 | 594 | ML_INLINE float4(float _x, const float3& v) 595 | : xmm(v4f_set(_x, v.x, v.y, v.z)) { 596 | } 597 | 598 | ML_INLINE float4(const v4f& v) 599 | : xmm(v) { 600 | } 601 | 602 | ML_INLINE float4(const float* v4) 603 | : xmm(_mm_loadu_ps(v4)) { 604 | } 605 | 606 | ML_INLINE float4(const float4& v) = default; 607 | 608 | // Set 609 | 610 | ML_INLINE void operator=(const float4& v) { 611 | xmm = v.xmm; 612 | } 613 | 614 | // Conversion 615 | 616 | ML_INLINE operator int4() const; 617 | ML_INLINE operator uint4() const; 618 | ML_INLINE operator double4() const; 619 | 620 | // Compare 621 | 622 | ML_COMPARE(bool4, float4, <, _mm_cmplt_ps, _mm_movemask_ps, xmm) 623 | ML_COMPARE(bool4, float4, <=, _mm_cmple_ps, _mm_movemask_ps, xmm) 624 | ML_COMPARE(bool4, float4, ==, _mm_cmpeq_ps, _mm_movemask_ps, xmm) 625 | ML_COMPARE(bool4, 
float4, >, _mm_cmpgt_ps, _mm_movemask_ps, xmm) 626 | ML_COMPARE(bool4, float4, >=, _mm_cmpge_ps, _mm_movemask_ps, xmm) 627 | ML_COMPARE(bool4, float4, !=, _mm_cmpneq_ps, _mm_movemask_ps, xmm) 628 | 629 | // Ops 630 | 631 | ML_INLINE float4 operator-() const { 632 | return v4f_negate(xmm); 633 | } 634 | 635 | ML_OP(float4, float, -, -=, _mm_sub_ps, _mm_set1_ps, xmm) 636 | ML_OP(float4, float, +, +=, _mm_add_ps, _mm_set1_ps, xmm) 637 | ML_OP(float4, float, *, *=, _mm_mul_ps, _mm_set1_ps, xmm) 638 | ML_OP(float4, float, /, /=, _mm_div_ps, _mm_set1_ps, xmm) 639 | 640 | // Misc 641 | 642 | ML_INLINE operator v4f() const { 643 | return xmm; 644 | } 645 | 646 | static ML_INLINE float4 Zero() { 647 | return _mm_setzero_ps(); 648 | } 649 | }; 650 | 651 | ML_INLINE float4 degrees(const float4& x) { 652 | return x * (180.0f / acosf(-1.0f)); 653 | } 654 | 655 | ML_INLINE float4 radians(const float4& x) { 656 | return x * (acosf(-1.0f) / 180.0f); 657 | } 658 | 659 | ML_INLINE float4 sign(const float4& x) { 660 | return v4f_sign(x.xmm); 661 | } 662 | 663 | ML_INLINE float4 abs(const float4& x) { 664 | return v4f_abs(x.xmm); 665 | } 666 | 667 | ML_INLINE float4 floor(const float4& x) { 668 | return v4f_floor(x.xmm); 669 | } 670 | 671 | ML_INLINE float4 round(const float4& x) { 672 | return v4f_round(x.xmm); 673 | } 674 | 675 | ML_INLINE float4 ceil(const float4& x) { 676 | return v4f_ceil(x.xmm); 677 | } 678 | 679 | ML_INLINE float4 frac(const float4& x) { 680 | return v4f_frac(x.xmm); 681 | } 682 | 683 | ML_INLINE float4 fmod(const float4& x, const float4& y) { 684 | return v4f_mod(x.xmm, y.xmm); 685 | } 686 | 687 | ML_INLINE float4 min(const float4& x, const float4& y) { 688 | return _mm_min_ps(x.xmm, y.xmm); 689 | } 690 | 691 | ML_INLINE float4 max(const float4& x, const float4& y) { 692 | return _mm_max_ps(x.xmm, y.xmm); 693 | } 694 | 695 | ML_INLINE float4 clamp(const float4& x, const float4& a, const float4& b) { 696 | return v4f_clamp(x.xmm, a.xmm, b.xmm); 697 | } 698 | 
699 | ML_INLINE float4 saturate(const float4& x) { 700 | return v4f_saturate(x.xmm); 701 | } 702 | 703 | ML_INLINE float4 lerp(const float4& a, const float4& b, const float4& x) { 704 | return v4f_mix(a.xmm, b.xmm, x.xmm); 705 | } 706 | 707 | ML_INLINE float4 linearstep(const float4& a, const float4& b, const float4& x) { 708 | return v4f_linearstep(a.xmm, b.xmm, x.xmm); 709 | } 710 | 711 | ML_INLINE float4 smoothstep(const float4& a, const float4& b, const float4& x) { 712 | return v4f_smoothstep(a.xmm, b.xmm, x.xmm); 713 | } 714 | 715 | ML_INLINE float4 step(const float4& edge, const float4& x) { 716 | return v4f_step(edge.xmm, x.xmm); 717 | } 718 | 719 | ML_INLINE float4 sin(const float4& x) { 720 | return _mm_sin_ps(x.xmm); 721 | } 722 | 723 | ML_INLINE float4 cos(const float4& x) { 724 | return _mm_cos_ps(x.xmm); 725 | } 726 | 727 | ML_INLINE float4 tan(const float4& x) { 728 | return _mm_tan_ps(x.xmm); 729 | } 730 | 731 | ML_INLINE float4 asin(const float4& x) { 732 | ML_Assert(all(x >= float4(-1.0f)) && all(x <= float4(1.0f))); 733 | 734 | return _mm_asin_ps(x.xmm); 735 | } 736 | 737 | ML_INLINE float4 acos(const float4& x) { 738 | ML_Assert(all(x >= float4(-1.0f)) && all(x <= float4(1.0f))); 739 | 740 | return _mm_acos_ps(x.xmm); 741 | } 742 | 743 | ML_INLINE float4 atan(const float4& x) { 744 | return _mm_atan_ps(x.xmm); 745 | } 746 | 747 | ML_INLINE float4 atan2(const float4& y, const float4& x) { 748 | return _mm_atan2_ps(y.xmm, x.xmm); 749 | } 750 | 751 | ML_INLINE float4 sqrt(const float4& x) { 752 | return _mm_sqrt_ps(x.xmm); 753 | } 754 | 755 | ML_INLINE float4 rsqrt(const float4& x) { 756 | return v4f_rsqrt(x.xmm); 757 | } 758 | 759 | ML_INLINE float4 rcp(const float4& x) { 760 | return v4f_rcp(x.xmm); 761 | } 762 | 763 | ML_INLINE float4 pow(const float4& x, const float4& y) { 764 | return _mm_pow_ps(x.xmm, y.xmm); 765 | } 766 | 767 | ML_INLINE float4 log(const float4& x) { 768 | return _mm_log_ps(x.xmm); 769 | } 770 | 771 | ML_INLINE float4 
log2(const float4& x) { 772 | return _mm_log2_ps(x.xmm); 773 | } 774 | 775 | ML_INLINE float4 exp(const float4& x) { 776 | return _mm_exp_ps(x.xmm); 777 | } 778 | 779 | ML_INLINE float4 exp2(const float4& x) { 780 | return _mm_exp2_ps(x.xmm); 781 | } 782 | 783 | ML_INLINE float4 madd(const float4& a, const float4& b, const float4& c) { 784 | return _mm_fmadd_ps(a.xmm, b.xmm, c.xmm); 785 | } 786 | 787 | ML_INLINE float dot(const float4& a, const float4& b) { 788 | v4f r = v4f_dot44(a.xmm, b.xmm); 789 | 790 | return _mm_cvtss_f32(r); 791 | } 792 | 793 | // Non-HLSL 794 | 795 | ML_INLINE float4 Pi(const float4& mul) { 796 | return mul * acosf(-1.0f); 797 | } 798 | 799 | ML_INLINE float Dot43(const float4& a, const float3& b) { 800 | v4f r = v4f_dot43(a.xmm, b.xmm); 801 | 802 | return _mm_cvtss_f32(r); 803 | } 804 | 805 | ML_INLINE float4 Snap(const float4& x, const float4& step) { 806 | return round(x / step) * step; 807 | } 808 | 809 | ML_INLINE float4 SinCos(const float4& x, float4* pCos) { 810 | return _mm_sincos_ps(&pCos->xmm, x.xmm); 811 | } 812 | 813 | // TODO: add "Quaternion" 814 | ML_INLINE float4 Slerp(const float4& a, const float4& b, float x) { 815 | ML_Assert(x >= 0.0f && x <= 1.0f); 816 | ML_Assert(abs(dot(a, a) - 1.0f) < 1e-5f); 817 | ML_Assert(abs(dot(b, b) - 1.0f) < 1e-5f); 818 | 819 | float4 r; 820 | 821 | float theta = dot(a, b); 822 | if (theta > 0.9995f) 823 | r = lerp(a, b, x); 824 | else { 825 | theta = acos(theta); 826 | 827 | float3 s = sin(theta * float3(1.0f, 1.0f - x, x)); 828 | float sn = 1.0f / s.x; 829 | float wa = s.y * sn; 830 | float wb = s.z * sn; 831 | 832 | r = a * wa + b * wb; 833 | } 834 | 835 | r *= rsqrt(dot(r, r)); 836 | 837 | return r; 838 | } 839 | 840 | //====================================================================================================================== 841 | // float4x4 842 | //====================================================================================================================== 843 | 844 
| // IMPORTANT: store - "column-major", usage - "row-major" (vector is a column) 845 | union float4x4 { 846 | // Column array 847 | struct { 848 | float4 ca[COORD_4D]; 849 | 850 | /* 851 | TODO: at least older GCC version don't allow this: 852 | 853 | float4 c0; 854 | float4 c1; 855 | float4 c2; 856 | float4 c3; 857 | 858 | because of this errors: 859 | - member with constructor not allowed in anonymous aggregate 860 | - member with copy assignment operator not allowed in anonymous aggregate 861 | */ 862 | }; 863 | 864 | // Element array 865 | struct { 866 | float a[COORD_4D * COORD_4D]; 867 | }; 868 | 869 | // Elements aXY, where X - row, Y - column 870 | struct { 871 | float a00, a10, a20, a30; 872 | float a01, a11, a21, a31; 873 | float a02, a12, a22, a32; 874 | float a03, a13, a23, a33; 875 | }; 876 | 877 | public: 878 | ML_INLINE float4x4() { 879 | ca[0] = _mm_setzero_ps(); 880 | ca[1] = _mm_setzero_ps(); 881 | ca[2] = _mm_setzero_ps(); 882 | ca[3] = _mm_setzero_ps(); 883 | } 884 | 885 | ML_INLINE float4x4(float m00, float m01, float m02, float m03, float m10, float m11, float m12, float m13, float m20, float m21, float m22, float m23, float m30, float m31, 886 | float m32, float m33) { 887 | ca[0] = v4f_set(m00, m10, m20, m30); 888 | ca[1] = v4f_set(m01, m11, m21, m31); 889 | ca[2] = v4f_set(m02, m12, m22, m32); 890 | ca[3] = v4f_set(m03, m13, m23, m33); 891 | } 892 | 893 | ML_INLINE float4x4(const float4& c0, const float4& c1, const float4& c2, const float4& c3) { 894 | ca[0] = c0; 895 | ca[1] = c1; 896 | ca[2] = c2; 897 | ca[3] = c3; 898 | } 899 | 900 | ML_INLINE float4x4(const float4x4& m) = default; 901 | 902 | // Set 903 | 904 | ML_INLINE void operator=(const float4x4& m) { 905 | ca[0] = m.ca[0]; 906 | ca[1] = m.ca[1]; 907 | ca[2] = m.ca[2]; 908 | ca[3] = m.ca[3]; 909 | } 910 | 911 | // Conversion 912 | 913 | ML_INLINE operator double4x4() const; 914 | 915 | // Compare 916 | 917 | ML_INLINE bool operator==(const float4x4& m) const { 918 | return 
all(ca[0] == m.ca[0]) && all(ca[1] == m.ca[1]) && all(ca[2] == m.ca[2]) && all(ca[3] == m.ca[3]); 919 | } 920 | 921 | ML_INLINE bool operator!=(const float4x4& m) const { 922 | return any(ca[0] != m.ca[0]) || any(ca[1] != m.ca[1]) || any(ca[2] != m.ca[2]) || any(ca[3] != m.ca[3]); 923 | } 924 | 925 | // NOTE: * 926 | 927 | ML_INLINE float4x4 operator*(const float4x4& m) const { 928 | float4x4 r; 929 | 930 | v4f r1 = _mm_mul_ps(v4f_swizzle(m.ca[0], 0, 0, 0, 0), ca[0]); 931 | v4f r2 = _mm_mul_ps(v4f_swizzle(m.ca[1], 0, 0, 0, 0), ca[0]); 932 | 933 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 1, 1, 1, 1), ca[1], r1); 934 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 1, 1, 1, 1), ca[1], r2); 935 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 2, 2, 2, 2), ca[2], r1); 936 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 2, 2, 2, 2), ca[2], r2); 937 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[0], 3, 3, 3, 3), ca[3], r1); 938 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[1], 3, 3, 3, 3), ca[3], r2); 939 | 940 | r.ca[0] = r1; 941 | r.ca[1] = r2; 942 | 943 | r1 = _mm_mul_ps(v4f_swizzle(m.ca[2], 0, 0, 0, 0), ca[0]); 944 | r2 = _mm_mul_ps(v4f_swizzle(m.ca[3], 0, 0, 0, 0), ca[0]); 945 | 946 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 1, 1, 1, 1), ca[1], r1); 947 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 1, 1, 1, 1), ca[1], r2); 948 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 2, 2, 2, 2), ca[2], r1); 949 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 2, 2, 2, 2), ca[2], r2); 950 | r1 = _mm_fmadd_ps(v4f_swizzle(m.ca[2], 3, 3, 3, 3), ca[3], r1); 951 | r2 = _mm_fmadd_ps(v4f_swizzle(m.ca[3], 3, 3, 3, 3), ca[3], r2); 952 | 953 | r.ca[2] = r1; 954 | r.ca[3] = r2; 955 | 956 | return r; 957 | } 958 | 959 | ML_INLINE float4 operator*(const float4& v) const { 960 | v4f r = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), ca[0]); 961 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), ca[1], r); 962 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), ca[2], r); 963 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 3, 3, 3, 3), ca[3], r); 964 | 965 | return r; 
966 | } 967 | 968 | ML_INLINE float3 operator*(const float3& v) const { 969 | v4f r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), ca[0], ca[3]); 970 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), ca[1], r); 971 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), ca[2], r); 972 | 973 | return r; 974 | } 975 | 976 | // Columns and rows 977 | 978 | float4& Col(uint32_t column) { 979 | ML_Assert(column < COORD_4D); 980 | 981 | return ca[column]; 982 | } 983 | 984 | const float4& Col(uint32_t column) const { 985 | ML_Assert(column < COORD_4D); 986 | 987 | return ca[column]; 988 | } 989 | 990 | float4& operator[](uint32_t column) { 991 | return Col(column); 992 | } 993 | 994 | const float4& operator[](uint32_t column) const { 995 | return Col(column); 996 | } 997 | 998 | ML_INLINE float4 Row(uint32_t row) const { 999 | ML_Assert(row < COORD_4D); 1000 | 1001 | return float4(a[row], a[COORD_4D + row], a[COORD_4D * 2 + row], a[COORD_4D * 3 + row]); 1002 | } 1003 | 1004 | // NOTE: other 1005 | 1006 | static ML_INLINE float4x4 Identity() { 1007 | return float4x4(1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f); 1008 | } 1009 | 1010 | ML_INLINE float GetNdcDepth(float z) const { 1011 | float c = a22 * z + a23; 1012 | float d = a32 * z + a33; 1013 | 1014 | return c / d; 1015 | } 1016 | 1017 | ML_INLINE float3 GetRotationYPR() const { 1018 | float3 r; 1019 | r.x = atan2(-a01, a11); 1020 | r.y = asin(a21); 1021 | r.z = atan2(-a20, a22); 1022 | 1023 | return r; 1024 | } 1025 | 1026 | ML_INLINE float4 GetQuaternion() const { 1027 | float4 q; 1028 | float t; 1029 | 1030 | if (a22 < 0.0f) { 1031 | if (a00 > a11) { 1032 | t = 1.0f + a00 - a11 - a22; 1033 | q = float4(t, a10 + a01, a02 + a20, a21 - a12); 1034 | } else { 1035 | t = 1.0f - a00 + a11 - a22; 1036 | q = float4(a10 + a01, t, a21 + a12, a02 - a20); 1037 | } 1038 | } else { 1039 | if (a00 < -a11) { 1040 | t = 1.0f - a00 - a11 + a22; 1041 | q = float4(a02 + a20, a21 + a12, t, 
a10 - a01); 1042 | } else { 1043 | t = 1.0f + a00 + a11 + a22; 1044 | q = float4(a21 - a12, a02 - a20, a10 - a01, t); 1045 | } 1046 | } 1047 | 1048 | q *= 0.5f / sqrt(t); 1049 | 1050 | return q; 1051 | } 1052 | 1053 | ML_INLINE float3 GetScale() const { 1054 | float3 scale = float3(_mm_cvtss_f32(v4f_length(ca[0])), _mm_cvtss_f32(v4f_length(ca[1])), _mm_cvtss_f32(v4f_length(ca[2]))); 1055 | 1056 | return scale; 1057 | } 1058 | 1059 | ML_INLINE void SetTranslation(const float3& p) { 1060 | ca[3] = v4f_setw1(p.xmm); 1061 | } 1062 | 1063 | ML_INLINE void AddTranslation(const float3& p) { 1064 | ca[3] = _mm_add_ps(ca[3], v4f_setw0(p.xmm)); 1065 | } 1066 | 1067 | ML_INLINE void PreTranslation(const float3& p); 1068 | 1069 | ML_INLINE void AddScale(const float3& scale) { 1070 | ca[0] = _mm_mul_ps(ca[0], scale.xmm); 1071 | ca[1] = _mm_mul_ps(ca[1], scale.xmm); 1072 | ca[2] = _mm_mul_ps(ca[2], scale.xmm); 1073 | } 1074 | 1075 | ML_INLINE void WorldToView(uint32_t uiProjFlags = 0) { 1076 | /* 1077 | float4x4 rot; 1078 | rot.SetupByRotationX(c_fHalfPi); 1079 | *this = (*this) * rot; 1080 | InvertOrtho(); 1081 | */ 1082 | 1083 | Swap(ca[1], ca[2]); 1084 | 1085 | if ((uiProjFlags & PROJ_LEFT_HANDED) == 0) 1086 | ca[2] = v4f_negate(ca[2]); 1087 | 1088 | Transpose3x4(); 1089 | } 1090 | 1091 | ML_INLINE void ViewToWorld(uint32_t uiProjFlags = 0) { 1092 | Transpose3x4(); 1093 | 1094 | if ((uiProjFlags & PROJ_LEFT_HANDED) == 0) 1095 | ca[2] = v4f_negate(ca[2]); 1096 | 1097 | Swap(ca[1], ca[2]); 1098 | } 1099 | 1100 | ML_INLINE bool IsLeftHanded() const { 1101 | float3 v1 = cross(float3(ca[0]), float3(ca[1])); 1102 | 1103 | return dot(v1, float3(ca[2])) < 0.0f; 1104 | } 1105 | 1106 | ML_INLINE void TransposeTo(float4x4& m) const { 1107 | v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]); 1108 | v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]); 1109 | v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]); 1110 | v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]); 1111 | 1112 | m.ca[0] = v4f_Axy_Bxy(xmm0, xmm1); 1113 | 
m.ca[1] = v4f_Azw_Bzw(xmm1, xmm0); 1114 | m.ca[2] = v4f_Axy_Bxy(xmm2, xmm3); 1115 | m.ca[3] = v4f_Azw_Bzw(xmm3, xmm2); 1116 | } 1117 | 1118 | ML_INLINE void Transpose() { 1119 | v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]); 1120 | v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]); 1121 | v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]); 1122 | v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]); 1123 | 1124 | ca[0] = v4f_Axy_Bxy(xmm0, xmm1); 1125 | ca[1] = v4f_Azw_Bzw(xmm1, xmm0); 1126 | ca[2] = v4f_Axy_Bxy(xmm2, xmm3); 1127 | ca[3] = v4f_Azw_Bzw(xmm3, xmm2); 1128 | } 1129 | 1130 | ML_INLINE void Transpose3x4() { 1131 | v4f xmm0 = v4f_Ax_Bx_Ay_By(ca[0], ca[1]); 1132 | v4f xmm1 = v4f_Ax_Bx_Ay_By(ca[2], ca[3]); 1133 | v4f xmm2 = v4f_Az_Bz_Aw_Bw(ca[0], ca[1]); 1134 | v4f xmm3 = v4f_Az_Bz_Aw_Bw(ca[2], ca[3]); 1135 | 1136 | ca[0] = v4f_Axy_Bxy(xmm0, xmm1); 1137 | ca[1] = v4f_Azw_Bzw(xmm1, xmm0); 1138 | ca[2] = v4f_Axy_Bxy(xmm2, xmm3); 1139 | } 1140 | 1141 | ML_INLINE void Invert() { 1142 | // NOTE: http://forum.devmaster.net/t/sse-mat4-inverse/16799 1143 | 1144 | v4f Fac0; 1145 | { 1146 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3); 1147 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2); 1148 | 1149 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2); 1150 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1151 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1152 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3); 1153 | 1154 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1155 | 1156 | Fac0 = _mm_fnmadd_ps(Swp02, Swp03, Mul00); 1157 | } 1158 | 1159 | v4f Fac1; 1160 | { 1161 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3); 1162 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1); 1163 | 1164 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1); 1165 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1166 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1167 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3); 1168 | 1169 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1170 | 1171 | Fac1 = _mm_fnmadd_ps(Swp02, Swp03, 
Mul00); 1172 | } 1173 | 1174 | v4f Fac2; 1175 | { 1176 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2); 1177 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1); 1178 | 1179 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1); 1180 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1181 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1182 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2); 1183 | 1184 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1185 | 1186 | Fac2 = _mm_fnmadd_ps(Swp02, Swp03, Mul00); 1187 | } 1188 | 1189 | v4f Fac3; 1190 | { 1191 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 3, 3, 3, 3); 1192 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0); 1193 | 1194 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0); 1195 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1196 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1197 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 3, 3, 3, 3); 1198 | 1199 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1200 | 1201 | Fac3 = _mm_fnmadd_ps(Swp02, Swp03, Mul00); 1202 | } 1203 | 1204 | v4f Fac4; 1205 | { 1206 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 2, 2, 2, 2); 1207 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0); 1208 | 1209 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0); 1210 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1211 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1212 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 2, 2, 2, 2); 1213 | 1214 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1215 | 1216 | Fac4 = _mm_fnmadd_ps(Swp02, Swp03, Mul00); 1217 | } 1218 | 1219 | v4f Fac5; 1220 | { 1221 | v4f Swp0a = v4f_shuffle(ca[3], ca[2], 1, 1, 1, 1); 1222 | v4f Swp0b = v4f_shuffle(ca[3], ca[2], 0, 0, 0, 0); 1223 | 1224 | v4f Swp00 = v4f_shuffle(ca[2], ca[1], 0, 0, 0, 0); 1225 | v4f Swp01 = v4f_swizzle(Swp0a, 0, 0, 0, 2); 1226 | v4f Swp02 = v4f_swizzle(Swp0b, 0, 0, 0, 2); 1227 | v4f Swp03 = v4f_shuffle(ca[2], ca[1], 1, 1, 1, 1); 1228 | 1229 | v4f Mul00 = _mm_mul_ps(Swp00, Swp01); 1230 | 1231 | Fac5 = _mm_fnmadd_ps(Swp02, Swp03, Mul00); 1232 | } 1233 | 1234 | 
v4f SignA = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); 1235 | v4f SignB = _mm_set_ps(-1.0f, 1.0f, -1.0f, 1.0f); 1236 | 1237 | v4f Temp0 = v4f_shuffle(ca[1], ca[0], 0, 0, 0, 0); 1238 | v4f Vec0 = v4f_swizzle(Temp0, 0, 2, 2, 2); 1239 | 1240 | v4f Temp1 = v4f_shuffle(ca[1], ca[0], 1, 1, 1, 1); 1241 | v4f Vec1 = v4f_swizzle(Temp1, 0, 2, 2, 2); 1242 | 1243 | v4f Temp2 = v4f_shuffle(ca[1], ca[0], 2, 2, 2, 2); 1244 | v4f Vec2 = v4f_swizzle(Temp2, 0, 2, 2, 2); 1245 | 1246 | v4f Temp3 = v4f_shuffle(ca[1], ca[0], 3, 3, 3, 3); 1247 | v4f Vec3 = v4f_swizzle(Temp3, 0, 2, 2, 2); 1248 | 1249 | v4f Mul0 = _mm_mul_ps(Vec1, Fac0); 1250 | v4f Mul1 = _mm_mul_ps(Vec0, Fac0); 1251 | v4f Mul2 = _mm_mul_ps(Vec0, Fac1); 1252 | v4f Mul3 = _mm_mul_ps(Vec0, Fac2); 1253 | 1254 | v4f Sub0 = _mm_fnmadd_ps(Vec2, Fac1, Mul0); 1255 | v4f Sub1 = _mm_fnmadd_ps(Vec2, Fac3, Mul1); 1256 | v4f Sub2 = _mm_fnmadd_ps(Vec1, Fac3, Mul2); 1257 | v4f Sub3 = _mm_fnmadd_ps(Vec1, Fac4, Mul3); 1258 | 1259 | v4f Add0 = _mm_fmadd_ps(Vec3, Fac2, Sub0); 1260 | v4f Add1 = _mm_fmadd_ps(Vec3, Fac4, Sub1); 1261 | v4f Add2 = _mm_fmadd_ps(Vec3, Fac5, Sub2); 1262 | v4f Add3 = _mm_fmadd_ps(Vec2, Fac5, Sub3); 1263 | 1264 | v4f Inv0 = _mm_mul_ps(SignB, Add0); 1265 | v4f Inv1 = _mm_mul_ps(SignA, Add1); 1266 | v4f Inv2 = _mm_mul_ps(SignB, Add2); 1267 | v4f Inv3 = _mm_mul_ps(SignA, Add3); 1268 | 1269 | v4f Row0 = v4f_shuffle(Inv0, Inv1, 0, 0, 0, 0); 1270 | v4f Row1 = v4f_shuffle(Inv2, Inv3, 0, 0, 0, 0); 1271 | v4f Row2 = v4f_shuffle(Row0, Row1, 0, 2, 0, 2); 1272 | 1273 | v4f Det0 = v4f_dot44(ca[0], Row2); 1274 | v4f Rcp0 = v4f_rcp(Det0); 1275 | 1276 | ca[0] = _mm_mul_ps(Inv0, Rcp0); 1277 | ca[1] = _mm_mul_ps(Inv1, Rcp0); 1278 | ca[2] = _mm_mul_ps(Inv2, Rcp0); 1279 | ca[3] = _mm_mul_ps(Inv3, Rcp0); 1280 | } 1281 | 1282 | ML_INLINE void InvertOrtho(); 1283 | 1284 | // NOTE: special sets 1285 | 1286 | ML_INLINE void SetupByQuaternion(const float4& q) { 1287 | // NOTE: assuming the quaternion is normalized 1288 | float x2 = q.x + q.x; 1289 
| float y2 = q.y + q.y; 1290 | float z2 = q.z + q.z; 1291 | float xx2 = q.x * x2; 1292 | float xy2 = q.x * y2; 1293 | float xz2 = q.x * z2; 1294 | float yy2 = q.y * y2; 1295 | float yz2 = q.y * z2; 1296 | float zz2 = q.z * z2; 1297 | float wx2 = q.w * x2; 1298 | float wy2 = q.w * y2; 1299 | float wz2 = q.w * z2; 1300 | 1301 | ca[0] = float4(1.0f - (yy2 + zz2), xy2 + wz2, xz2 - wy2, 0.0f).xmm; 1302 | ca[1] = float4(xy2 - wz2, 1.0f - (xx2 + zz2), yz2 + wx2, 0.0f).xmm; 1303 | ca[2] = float4(xz2 + wy2, yz2 - wx2, 1.0f - (xx2 + yy2), 0.0f).xmm; 1304 | ca[3] = c_v4f_0001; 1305 | } 1306 | 1307 | ML_INLINE void SetupByRotationX(float angleX) { 1308 | float ct = cos(angleX); 1309 | float st = sin(angleX); 1310 | 1311 | ca[0] = float4(1.0f, 0.0f, 0.0f, 0.0f); 1312 | ca[1] = float4(0.0f, ct, st, 0.0f); 1313 | ca[2] = float4(0.0f, -st, ct, 0.0f); 1314 | ca[3] = c_v4f_0001; 1315 | } 1316 | 1317 | ML_INLINE void SetupByRotationY(float angleY) { 1318 | float ct = cos(angleY); 1319 | float st = sin(angleY); 1320 | 1321 | ca[0] = float4(ct, 0.0f, -st, 0.0f); 1322 | ca[1] = float4(0.0f, 1.0f, 0.0f, 0.0f); 1323 | ca[2] = float4(st, 0.0f, ct, 0.0f); 1324 | ca[3] = c_v4f_0001; 1325 | } 1326 | 1327 | ML_INLINE void SetupByRotationZ(float angleZ) { 1328 | float ct = cos(angleZ); 1329 | float st = sin(angleZ); 1330 | 1331 | ca[0] = float4(ct, st, 0.0f, 0.0f); 1332 | ca[1] = float4(-st, ct, 0.0f, 0.0f); 1333 | ca[2] = float4(0.0f, 0.0f, 1.0f, 0.0f); 1334 | ca[3] = c_v4f_0001; 1335 | } 1336 | 1337 | ML_INLINE void SetupByRotationYPR(float fYaw, float fPitch, float fRoll) { 1338 | // NOTE: "yaw-pitch-roll" rotation 1339 | // yaw - around Z (object "down-up" axis) 1340 | // pitch - around X (object "left-right" axis) 1341 | // roll - around Y (object "backward-forward" axis) 1342 | 1343 | /* 1344 | float4x4 rot; 1345 | rot.SetupByRotationY(fRoll); 1346 | *this = rot; 1347 | rot.SetupByRotationX(fPitch); 1348 | *this = rot * (*this); 1349 | rot.SetupByRotationZ(fYaw); 1350 | *this = rot * 
(*this); 1351 | */ 1352 | 1353 | float4 angles(fYaw, fPitch, fRoll, 0.0f); 1354 | 1355 | float4 c; 1356 | float4 s = _mm_sincos_ps(&c.xmm, angles.xmm); 1357 | 1358 | a00 = c.x * c.z - s.x * s.y * s.z; 1359 | a10 = s.x * c.z + c.x * s.y * s.z; 1360 | a20 = -c.y * s.z; 1361 | a30 = 0.0f; 1362 | 1363 | a01 = -s.x * c.y; 1364 | a11 = c.x * c.y; 1365 | a21 = s.y; 1366 | a31 = 0.0f; 1367 | 1368 | a02 = c.x * s.z + c.z * s.x * s.y; 1369 | a12 = s.x * s.z - c.x * s.y * c.z; 1370 | a22 = c.y * c.z; 1371 | a32 = 0.0f; 1372 | 1373 | ca[3] = c_v4f_0001; 1374 | } 1375 | 1376 | ML_INLINE void SetupByRotation(float theta, const float3& v) { 1377 | float ct = cos(theta); 1378 | float st = sin(theta); 1379 | 1380 | SetupByRotation(st, ct, v); 1381 | } 1382 | 1383 | ML_INLINE void SetupByRotation(float st, float ct, const float3& v) { 1384 | float xx = v.x * v.x; 1385 | float yy = v.y * v.y; 1386 | float zz = v.z * v.z; 1387 | float xy = v.x * v.y; 1388 | float xz = v.x * v.z; 1389 | float yz = v.y * v.z; 1390 | float ctxy = ct * xy; 1391 | float ctxz = ct * xz; 1392 | float ctyz = ct * yz; 1393 | float sty = st * v.y; 1394 | float stx = st * v.x; 1395 | float stz = st * v.z; 1396 | 1397 | a00 = xx + ct * (1.0f - xx); 1398 | a01 = xy - ctxy - stz; 1399 | a02 = xz - ctxz + sty; 1400 | 1401 | a10 = xy - ctxy + stz; 1402 | a11 = yy + ct * (1.0f - yy); 1403 | a12 = yz - ctyz - stx; 1404 | 1405 | a20 = xz - ctxz - sty; 1406 | a21 = yz - ctyz + stx; 1407 | a22 = zz + ct * (1.0f - zz); 1408 | 1409 | a30 = 0.0f; 1410 | a31 = 0.0f; 1411 | a32 = 0.0f; 1412 | 1413 | ca[3] = c_v4f_0001; 1414 | } 1415 | 1416 | ML_INLINE void SetupByRotation(const float3& z, const float3& d) { 1417 | /* 1418 | // NOTE: same as 1419 | 1420 | float3 axis = cross(z, d); 1421 | float angle = Acos( dot(z, d) ); 1422 | 1423 | SetupByRotation(angle, axis); 1424 | */ 1425 | 1426 | float3 w = cross(z, d); 1427 | float c = dot(z, d); 1428 | float k = (1.0f - c) / (1.0f - c * c); 1429 | 1430 | float hxy = w.x * w.y * k; 
1431 | float hxz = w.x * w.z * k; 1432 | float hyz = w.y * w.z * k; 1433 | 1434 | a00 = c + w.x * w.x * k; 1435 | a01 = hxy - w.z; 1436 | a02 = hxz + w.y; 1437 | 1438 | a10 = hxy + w.z; 1439 | a11 = c + w.y * w.y * k; 1440 | a12 = hyz - w.x; 1441 | 1442 | a20 = hxz - w.y; 1443 | a21 = hyz + w.x; 1444 | a22 = c + w.z * w.z * k; 1445 | 1446 | a30 = 0.0f; 1447 | a31 = 0.0f; 1448 | a32 = 0.0f; 1449 | 1450 | ca[3] = c_v4f_0001; 1451 | } 1452 | 1453 | ML_INLINE void SetupByTranslation(const float3& p) { 1454 | ca[0] = float4(1.0f, 0.0f, 0.0f, 0.0f); 1455 | ca[1] = float4(0.0f, 1.0f, 0.0f, 0.0f); 1456 | ca[2] = float4(0.0f, 0.0f, 1.0f, 0.0f); 1457 | ca[3] = v4f_setw1(p); 1458 | } 1459 | 1460 | ML_INLINE void SetupByScale(const float3& scale) { 1461 | ca[0] = float4(scale.x, 0.0f, 0.0f, 0.0f); 1462 | ca[1] = float4(0.0f, scale.y, 0.0f, 0.0f); 1463 | ca[2] = float4(0.0f, 0.0f, scale.z, 0.0f); 1464 | ca[3] = c_v4f_0001; 1465 | } 1466 | 1467 | ML_INLINE void SetupByLookAt(const float3& vForward) { 1468 | float3 y = normalize(vForward); 1469 | float3 z = GetPerpendicularVector(y); 1470 | float3 x = cross(y, z); 1471 | 1472 | ca[0] = v4f_setw0(x); 1473 | ca[1] = v4f_setw0(y); 1474 | ca[2] = v4f_setw0(z); 1475 | ca[3] = c_v4f_0001; 1476 | } 1477 | 1478 | ML_INLINE void SetupByLookAt(const float3& vForward, const float3& vRight) { 1479 | float3 y = normalize(vForward); 1480 | float3 z = normalize(cross(vRight, y)); 1481 | float3 x = cross(y, z); 1482 | 1483 | ca[0] = v4f_setw0(x); 1484 | ca[1] = v4f_setw0(y); 1485 | ca[2] = v4f_setw0(z); 1486 | ca[3] = c_v4f_0001; 1487 | } 1488 | 1489 | ML_INLINE void SetupByOrthoProjection(float left, float right, float bottom, float top, float zNear, float zFar, uint32_t uiProjFlags = 0) { 1490 | ML_Assert(left < right); 1491 | ML_Assert(bottom < top); 1492 | 1493 | float rWidth = 1.0f / (right - left); 1494 | float rHeight = 1.0f / (top - bottom); 1495 | float rDepth = 1.0f / (zFar - zNear); 1496 | 1497 | a00 = 2.0f * rWidth; 1498 | a01 = 
0.0f; 1499 | a02 = 0.0f; 1500 | a03 = -(right + left) * rWidth; 1501 | 1502 | a10 = 0.0f; 1503 | a11 = 2.0f * rHeight; 1504 | a12 = 0.0f; 1505 | a13 = -(top + bottom) * rHeight; 1506 | 1507 | a20 = 0.0f; 1508 | a21 = 0.0f; 1509 | a22 = -2.0f * rDepth; 1510 | a23 = -(zFar + zNear) * rDepth; 1511 | 1512 | a30 = 0.0f; 1513 | a31 = 0.0f; 1514 | a32 = 0.0f; 1515 | a33 = 1.0f; 1516 | 1517 | bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0; 1518 | 1519 | a22 = ML_ModifyProjZ(bReverseZ, a22, a32); 1520 | a23 = ML_ModifyProjZ(bReverseZ, a23, a33); 1521 | 1522 | if (uiProjFlags & PROJ_LEFT_HANDED) 1523 | ca[2] = v4f_negate(ca[2]); 1524 | } 1525 | 1526 | ML_INLINE void SetupByFrustum(float left, float right, float bottom, float top, float zNear, float zFar, uint32_t uiProjFlags = 0) { 1527 | ML_Assert(left < right); 1528 | ML_Assert(bottom < top); 1529 | 1530 | float rWidth = 1.0f / (right - left); 1531 | float rHeight = 1.0f / (top - bottom); 1532 | float rDepth = 1.0f / (zNear - zFar); 1533 | 1534 | a00 = 2.0f * zNear * rWidth; 1535 | a01 = 0.0f; 1536 | a02 = (right + left) * rWidth; 1537 | a03 = 0.0f; 1538 | 1539 | a10 = 0.0f; 1540 | a11 = 2.0f * zNear * rHeight; 1541 | a12 = (top + bottom) * rHeight; 1542 | a13 = 0.0f; 1543 | 1544 | a20 = 0.0f; 1545 | a21 = 0.0f; 1546 | a22 = (zFar + zNear) * rDepth; 1547 | a23 = 2.0f * zFar * zNear * rDepth; 1548 | 1549 | a30 = 0.0f; 1550 | a31 = 0.0f; 1551 | a32 = -1.0f; 1552 | a33 = 0.0f; 1553 | 1554 | bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0; 1555 | 1556 | a22 = ML_ModifyProjZ(bReverseZ, a22, a32); 1557 | a23 = ML_ModifyProjZ(bReverseZ, a23, a33); 1558 | 1559 | if (uiProjFlags & PROJ_LEFT_HANDED) 1560 | ca[2] = v4f_negate(ca[2]); 1561 | } 1562 | 1563 | ML_INLINE void SetupByFrustumInf(float left, float right, float bottom, float top, float zNear, uint32_t uiProjFlags = 0) { 1564 | ML_Assert(left < right); 1565 | ML_Assert(bottom < top); 1566 | 1567 | float rWidth = 1.0f / (right - left); 1568 | float rHeight = 1.0f / 
(top - bottom); 1569 | 1570 | a00 = 2.0f * zNear * rWidth; 1571 | a01 = 0.0f; 1572 | a02 = (right + left) * rWidth; 1573 | a03 = 0.0f; 1574 | 1575 | a10 = 0.0f; 1576 | a11 = 2.0f * zNear * rHeight; 1577 | a12 = (top + bottom) * rHeight; 1578 | a13 = 0.0f; 1579 | 1580 | a20 = 0.0f; 1581 | a21 = 0.0f; 1582 | a22 = -1.0f; 1583 | a23 = -2.0f * zNear; 1584 | 1585 | a30 = 0.0f; 1586 | a31 = 0.0f; 1587 | a32 = -1.0f; 1588 | a33 = 0.0f; 1589 | 1590 | bool bReverseZ = (uiProjFlags & PROJ_REVERSED_Z) != 0; 1591 | 1592 | a22 = ML_ModifyProjZ(bReverseZ, a22, a32); 1593 | a23 = ML_ModifyProjZ(bReverseZ, a23, a33); 1594 | 1595 | if (uiProjFlags & PROJ_LEFT_HANDED) 1596 | ca[2] = v4f_negate(ca[2]); 1597 | } 1598 | 1599 | ML_INLINE void SetupByHalfFovy(float halfFovy, float aspect, float zNear, float zFar, uint32_t uiProjFlags = 0) { 1600 | float ymax = zNear * tan(halfFovy); 1601 | float xmax = ymax * aspect; 1602 | 1603 | SetupByFrustum(-xmax, xmax, -ymax, ymax, zNear, zFar, uiProjFlags); 1604 | } 1605 | 1606 | ML_INLINE void SetupByHalfFovyInf(float halfFovy, float aspect, float zNear, uint32_t uiProjFlags = 0) { 1607 | float ymax = zNear * tan(halfFovy); 1608 | float xmax = ymax * aspect; 1609 | 1610 | SetupByFrustumInf(-xmax, xmax, -ymax, ymax, zNear, uiProjFlags); 1611 | } 1612 | 1613 | ML_INLINE void SetupByHalfFovx(float halfFovx, float aspect, float zNear, float zFar, uint32_t uiProjFlags = 0) { 1614 | float xmax = zNear * tan(halfFovx); 1615 | float ymax = xmax / aspect; 1616 | 1617 | SetupByFrustum(-xmax, xmax, -ymax, ymax, zNear, zFar, uiProjFlags); 1618 | } 1619 | 1620 | ML_INLINE void SetupByHalfFovxInf(float halfFovx, float aspect, float zNear, uint32_t uiProjFlags = 0) { 1621 | float xmax = zNear * tan(halfFovx); 1622 | float ymax = xmax / aspect; 1623 | 1624 | SetupByFrustumInf(-xmax, xmax, -ymax, ymax, zNear, uiProjFlags); 1625 | } 1626 | 1627 | ML_INLINE void SetupByAngles(float angleMinx, float angleMaxx, float angleMiny, float angleMaxy, float zNear, float 
zFar, uint32_t uiProjFlags = 0) { 1628 | float xmin = tan(angleMinx) * zNear; 1629 | float xmax = tan(angleMaxx) * zNear; 1630 | float ymin = tan(angleMiny) * zNear; 1631 | float ymax = tan(angleMaxy) * zNear; 1632 | 1633 | SetupByFrustum(xmin, xmax, ymin, ymax, zNear, zFar, uiProjFlags); 1634 | } 1635 | 1636 | ML_INLINE void SetupByAnglesInf(float angleMinx, float angleMaxx, float angleMiny, float angleMaxy, float zNear, uint32_t uiProjFlags = 0) { 1637 | float xmin = tan(angleMinx) * zNear; 1638 | float xmax = tan(angleMaxx) * zNear; 1639 | float ymin = tan(angleMiny) * zNear; 1640 | float ymax = tan(angleMaxy) * zNear; 1641 | 1642 | SetupByFrustumInf(xmin, xmax, ymin, ymax, zNear, uiProjFlags); 1643 | } 1644 | 1645 | ML_INLINE void SubsampleProjection(float dx, float dy, uint32_t viewportWidth, uint32_t viewportHeight) { 1646 | // NOTE: dx/dy in range [-1; 1] 1647 | 1648 | a02 += dx / float(viewportWidth); 1649 | a12 += dy / float(viewportHeight); 1650 | } 1651 | 1652 | ML_INLINE bool IsProjectionValid() const { 1653 | // Do not check a20 and a21 to allow off-centered projections 1654 | // Do not check a22 to allow reverse infinite projections 1655 | 1656 | return ((a00 != 0.0f && a10 == 0.0f && a20 == 0.0f && a30 == 0.0f) && (a01 == 0.0f && a11 != 0.0f && a21 == 0.0f && a31 == 0.0f) && (a32 == 1.0f || a32 == -1.0f) && (a03 == 0.0f && a13 == 0.0f && a23 != 0.0f && a33 == 0.0f)); 1657 | } 1658 | }; 1659 | 1660 | ML_INLINE float4 mul(const float4x4& m, const float4& v) { 1661 | return m * v; 1662 | } 1663 | 1664 | ML_INLINE float4x4 transpose(const float4x4& m) { 1665 | float4x4 res; 1666 | m.TransposeTo(res); 1667 | 1668 | return res; 1669 | } 1670 | 1671 | // non-HLSL 1672 | 1673 | ML_INLINE float3 Rotate(const float4x4& m, const float3& v) { 1674 | v4f r = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), m.ca[0]); 1675 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), m.ca[1], r); 1676 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), m.ca[2], r); 1677 | r = 
v4f_setw0(r); 1678 | 1679 | return r; 1680 | } 1681 | 1682 | ML_INLINE float3 RotateAbs(const float4x4& m, const float3& v) { 1683 | v4f col0_abs = v4f_abs(m.ca[0]); 1684 | v4f col1_abs = v4f_abs(m.ca[1]); 1685 | v4f col2_abs = v4f_abs(m.ca[2]); 1686 | 1687 | v4f r = _mm_mul_ps(v4f_swizzle(v.xmm, 0, 0, 0, 0), col0_abs); 1688 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 1, 1, 1, 1), col1_abs, r); 1689 | r = _mm_fmadd_ps(v4f_swizzle(v.xmm, 2, 2, 2, 2), col2_abs, r); 1690 | 1691 | return r; 1692 | } 1693 | 1694 | ML_INLINE float3 Project(const float3& v, const float4x4& m) { 1695 | float4 clip = (m * v).xmm; 1696 | clip /= clip.w; 1697 | 1698 | return clip.xyz; 1699 | } 1700 | 1701 | ML_INLINE void float4x4::PreTranslation(const float3& p) { 1702 | v4f r = Rotate(*this, p.xmm).xmm; 1703 | ca[3] = _mm_add_ps(ca[3], r); 1704 | } 1705 | 1706 | ML_INLINE void float4x4::InvertOrtho() { 1707 | Transpose3x4(); 1708 | 1709 | ca[3] = Rotate(*this, float3(ca[3])).xmm; 1710 | ca[3] = v4f_negate(ca[3]); 1711 | 1712 | ca[0] = v4f_setw0(ca[0]); 1713 | ca[1] = v4f_setw0(ca[1]); 1714 | ca[2] = v4f_setw0(ca[2]); 1715 | ca[3] = v4f_setw1(ca[3]); 1716 | } 1717 | 1718 | //====================================================================================================================== 1719 | // cBoxf 1720 | //====================================================================================================================== 1721 | 1722 | struct cBoxf { 1723 | float3 vMin; 1724 | float3 vMax; 1725 | 1726 | public: 1727 | ML_INLINE cBoxf() { 1728 | Clear(); 1729 | } 1730 | 1731 | ML_INLINE cBoxf(const float3& v) { 1732 | vMin = v; 1733 | vMax = v; 1734 | } 1735 | 1736 | ML_INLINE cBoxf(const float3& minv, const float3& maxv) { 1737 | vMin = minv; 1738 | vMax = maxv; 1739 | } 1740 | 1741 | ML_INLINE void Clear() { 1742 | vMin = float3(c_v4f_Inf); 1743 | vMax = float3(c_v4f_InfMinus); 1744 | } 1745 | 1746 | ML_INLINE bool IsValid() const { 1747 | v4f r = _mm_cmplt_ps(vMin.xmm, vMax.xmm); 1748 
| 1749 | return v4f_test3_all(r); 1750 | } 1751 | 1752 | ML_INLINE float3 GetCenter() const { 1753 | return (vMin + vMax) * 0.5f; 1754 | } 1755 | 1756 | ML_INLINE float GetRadius() const { 1757 | return length(vMax - vMin) * 0.5f; 1758 | } 1759 | 1760 | ML_INLINE void Scale(float fScale) { 1761 | fScale *= 0.5f; 1762 | 1763 | float k1 = 0.5f + fScale; 1764 | float k2 = 0.5f - fScale; 1765 | 1766 | float3 a = vMin * k1 + vMax * k2; 1767 | float3 b = vMax * k1 + vMin * k2; 1768 | 1769 | vMin = a; 1770 | vMax = b; 1771 | } 1772 | 1773 | ML_INLINE void Enlarge(const float3& vBorder) { 1774 | vMin -= vBorder; 1775 | vMax += vBorder; 1776 | } 1777 | 1778 | ML_INLINE void Add(const float3& v) { 1779 | vMin = _mm_min_ps(vMin.xmm, v.xmm); 1780 | vMax = _mm_max_ps(vMax.xmm, v.xmm); 1781 | } 1782 | 1783 | ML_INLINE void Add(const cBoxf& b) { 1784 | vMin = _mm_min_ps(vMin.xmm, b.vMin.xmm); 1785 | vMax = _mm_max_ps(vMax.xmm, b.vMax.xmm); 1786 | } 1787 | 1788 | ML_INLINE float DistanceSquared(const float3& from) const { 1789 | v4f p = v4f_clamp(from.xmm, vMin.xmm, vMax.xmm); 1790 | p = _mm_sub_ps(p, from.xmm); 1791 | p = v4f_dot33(p, p); 1792 | 1793 | return _mm_cvtss_f32(p); 1794 | } 1795 | 1796 | ML_INLINE float Distance(const float3& from) const { 1797 | v4f p = v4f_clamp(from.xmm, vMin.xmm, vMax.xmm); 1798 | p = _mm_sub_ps(p, from.xmm); 1799 | p = v4f_length(p); 1800 | 1801 | return _mm_cvtss_f32(p); 1802 | } 1803 | 1804 | ML_INLINE bool IsIntersectWith(const cBoxf& b) const { 1805 | v4f r = _mm_cmplt_ps(vMax.xmm, b.vMin.xmm); 1806 | r = _mm_or_ps(r, _mm_cmpgt_ps(vMin.xmm, b.vMax.xmm)); 1807 | 1808 | return v4f_test3_none(r); 1809 | } 1810 | 1811 | // NOTE: intersection state 'b' vs 'this' 1812 | 1813 | ML_INLINE eClip GetIntersectionState(const cBoxf& b) const { 1814 | if (!IsIntersectWith(b)) 1815 | return CLIP_OUT; 1816 | 1817 | v4f r = _mm_cmplt_ps(vMin.xmm, b.vMin.xmm); 1818 | r = _mm_and_ps(r, _mm_cmpgt_ps(vMax.xmm, b.vMax.xmm)); 1819 | 1820 | return v4f_test3_all(r) ? 
CLIP_IN : CLIP_PARTIAL; 1821 | } 1822 | 1823 | ML_INLINE bool IsContain(const float3& p) const { 1824 | v4f r = _mm_cmplt_ps(p.xmm, vMin.xmm); 1825 | r = _mm_or_ps(r, _mm_cmpgt_ps(p.xmm, vMax.xmm)); 1826 | 1827 | return v4f_test3_none(r); 1828 | } 1829 | 1830 | ML_INLINE bool IsContainSphere(const float3& center, float radius) const { 1831 | v4f r = _mm_set1_ps(radius); 1832 | v4f t = _mm_sub_ps(vMin.xmm, r); 1833 | t = _mm_cmplt_ps(center.xmm, t); 1834 | 1835 | if (v4f_test3_any(t)) 1836 | return false; 1837 | 1838 | t = _mm_add_ps(vMax.xmm, r); 1839 | t = _mm_cmpgt_ps(center.xmm, t); 1840 | 1841 | if (v4f_test3_any(t)) 1842 | return false; 1843 | 1844 | return true; 1845 | } 1846 | 1847 | ML_INLINE uint32_t GetIntersectionBits(const cBoxf& b) const { 1848 | v4f r = _mm_cmpge_ps(b.vMin.xmm, vMin.xmm); 1849 | uint32_t bits = (_mm_movemask_ps(r) & ML_Mask(1, 1, 1, 0)); 1850 | 1851 | r = _mm_cmple_ps(b.vMax.xmm, vMax.xmm); 1852 | bits |= (_mm_movemask_ps(r) & ML_Mask(1, 1, 1, 0)) << 3; 1853 | 1854 | return bits; 1855 | } 1856 | 1857 | ML_INLINE uint32_t IsContain(const float3& p, uint32_t bits) const { 1858 | v4f r = _mm_cmpge_ps(p.xmm, vMin.xmm); 1859 | bits |= (_mm_movemask_ps(r) & ML_Mask(1, 1, 1, 0)); 1860 | 1861 | r = _mm_cmple_ps(p.xmm, vMax.xmm); 1862 | bits |= (_mm_movemask_ps(r) & ML_Mask(1, 1, 1, 0)) << 3; 1863 | 1864 | return bits; 1865 | } 1866 | 1867 | ML_INLINE bool IsIntersectWith(const float3& vRayPos, const float3& vRayDir, float* out_fTmin, float* out_fTmax) const { 1868 | // NOTE: http://tavianator.com/2011/05/fast-branchless-raybounding-box-intersections/ 1869 | 1870 | // IMPORTANT: store '1 / ray_dir' and filter INFs out! 
1871 | 1872 | v4f t1 = _mm_div_ps(_mm_sub_ps(vMin.xmm, vRayPos.xmm), vRayDir.xmm); 1873 | v4f t2 = _mm_div_ps(_mm_sub_ps(vMax.xmm, vRayPos.xmm), vRayDir.xmm); 1874 | 1875 | v4f vmin = _mm_min_ps(t1, t2); 1876 | v4f vmax = _mm_max_ps(t1, t2); 1877 | 1878 | // NOTE: hmax.xxx 1879 | v4f tmin = _mm_max_ps(vmin, v4f_swizzle(vmin, ML_Y, ML_Z, ML_X, 0)); 1880 | tmin = _mm_max_ps(tmin, v4f_swizzle(vmin, ML_Z, ML_X, ML_Y, 0)); 1881 | 1882 | // NOTE: hmin.xxx 1883 | v4f tmax = _mm_min_ps(vmax, v4f_swizzle(vmax, ML_Y, ML_Z, ML_X, 0)); 1884 | tmax = _mm_min_ps(tmax, v4f_swizzle(vmax, ML_Z, ML_X, ML_Y, 0)); 1885 | 1886 | v4f_store_x(out_fTmin, tmin); 1887 | v4f_store_x(out_fTmax, tmax); 1888 | 1889 | v4f cmp = _mm_cmpge_ps(tmax, tmin); 1890 | 1891 | return (_mm_movemask_ps(cmp) & ML_Mask(1, 0, 0, 0)) == ML_Mask(1, 0, 0, 0); 1892 | } 1893 | }; 1894 | 1895 | ML_INLINE void TransformAabb(const float4x4& mTransform, const cBoxf& src, cBoxf& dst) { 1896 | float3 center = (src.vMin + src.vMax) * 0.5f; 1897 | float3 extends = src.vMax - center; 1898 | 1899 | center = mTransform * center; 1900 | extends = RotateAbs(mTransform, extends); 1901 | 1902 | dst.vMin = center - extends; 1903 | dst.vMax = center + extends; 1904 | } 1905 | -------------------------------------------------------------------------------- /Guts/i32.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | //====================================================================================================================== 6 | // int2 7 | //====================================================================================================================== 8 | 9 | union int2 { 10 | v2i mm; 11 | 12 | struct { 13 | int32_t a[COORD_2D]; 14 | }; 15 | 16 | struct { 17 | int32_t x, y; 18 | }; 19 | 20 | ML_SWIZZLE_2(int2, int32_t); 21 | 22 | public: 23 | ML_INLINE int2() 24 | : mm(0) { 25 | } 26 | 27 | ML_INLINE int2(int32_t c) 28 | 
: x(c), y(c) { 29 | } 30 | 31 | ML_INLINE int2(int32_t _x, int32_t _y) 32 | : x(_x), y(_y) { 33 | } 34 | 35 | ML_INLINE int2(const int2& v) = default; 36 | 37 | // Set 38 | 39 | ML_INLINE void operator=(const int2& v) { 40 | mm = v.mm; 41 | } 42 | 43 | // Conversion 44 | 45 | ML_INLINE operator uint2() const; 46 | ML_INLINE operator float2() const; 47 | ML_INLINE operator double2() const; 48 | 49 | // Compare 50 | 51 | ML_COMPARE_UNOPT(bool2, int2, <) 52 | ML_COMPARE_UNOPT(bool2, int2, <=) 53 | ML_COMPARE_UNOPT(bool2, int2, ==) 54 | ML_COMPARE_UNOPT(bool2, int2, >=) 55 | ML_COMPARE_UNOPT(bool2, int2, >) 56 | ML_COMPARE_UNOPT(bool2, int2, !=) 57 | 58 | // Ops 59 | 60 | ML_INLINE int2 operator-() const { 61 | return int2(-x, -y); 62 | } 63 | 64 | ML_OP_UNOPT(int2, int32_t, -, -=) 65 | ML_OP_UNOPT(int2, int32_t, +, +=) 66 | ML_OP_UNOPT(int2, int32_t, *, *=) 67 | ML_OP_UNOPT(int2, int32_t, /, /=) 68 | ML_OP_UNOPT(int2, int32_t, %, %=) 69 | ML_OP_UNOPT(int2, int32_t, <<, <<=) 70 | ML_OP_UNOPT(int2, int32_t, >>, >>=) 71 | ML_OP_UNOPT(int2, int32_t, &, &=) 72 | ML_OP_UNOPT(int2, int32_t, |, |=) 73 | ML_OP_UNOPT(int2, int32_t, ^, ^=) 74 | }; 75 | 76 | ML_INLINE int2 min(const int2& x, const int2& y) { 77 | return int2(min(x.x, y.x), min(x.y, y.y)); 78 | } 79 | 80 | ML_INLINE int2 max(const int2& x, const int2& y) { 81 | return int2(max(x.x, y.x), max(x.y, y.y)); 82 | } 83 | 84 | //====================================================================================================================== 85 | // int3 86 | //====================================================================================================================== 87 | 88 | union int3 { 89 | v4i xmm; 90 | 91 | struct { 92 | int32_t a[COORD_3D]; 93 | }; 94 | 95 | struct { 96 | int32_t x, y, z; 97 | }; 98 | 99 | ML_SWIZZLE_3(v4i_swizzle2, int2, v4i_swizzle3, int3); 100 | 101 | public: 102 | ML_INLINE int3() 103 | : xmm(_mm_setzero_si128()) { 104 | } 105 | 106 | ML_INLINE int3(int32_t c) 107 | : 
xmm(_mm_set1_epi32(c)) { 108 | } 109 | 110 | ML_INLINE int3(int32_t _x, int32_t _y, int32_t _z) 111 | : xmm(v4i_set(_x, _y, _z, 1)) { 112 | } 113 | 114 | ML_INLINE int3(const int2& v, int32_t _z) 115 | : xmm(v4i_set(v.x, v.y, _z, 1)) { 116 | } 117 | 118 | ML_INLINE int3(int32_t _x, const int2& v) 119 | : xmm(v4i_set(_x, v.x, v.y, 1)) { 120 | } 121 | 122 | ML_INLINE int3(const v4i& v) 123 | : xmm(v) { 124 | } 125 | 126 | ML_INLINE int3(const int32_t* v3) 127 | : xmm(v4i_set(v3[0], v3[1], v3[2], 1)) { 128 | } 129 | 130 | ML_INLINE int3(const int3& v) = default; 131 | 132 | // Set 133 | 134 | ML_INLINE void operator=(const int3& v) { 135 | xmm = v.xmm; 136 | } 137 | 138 | // Conversion 139 | 140 | ML_INLINE operator uint3() const; 141 | ML_INLINE operator float3() const; 142 | ML_INLINE operator double3() const; 143 | 144 | // Compare 145 | 146 | ML_COMPARE(bool3, int3, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm) 147 | ML_COMPARE(bool3, int3, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm) 148 | ML_COMPARE(bool3, int3, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm) 149 | ML_COMPARE(bool3, int3, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm) 150 | ML_COMPARE(bool3, int3, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm) 151 | ML_COMPARE(bool3, int3, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm) 152 | 153 | // Ops 154 | 155 | ML_INLINE int3 operator-() const { 156 | return _mm_xor_si128(xmm, _mm_set1_epi32(0x80000000)); 157 | } 158 | 159 | ML_OP(int3, int32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm) 160 | ML_OP(int3, int32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm) 161 | ML_OP(int3, int32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm) 162 | ML_OP(int3, int32_t, /, /=, _mm_div_epi32, _mm_set1_epi32, xmm) 163 | ML_OP(int3, int32_t, %, %=, v4i_mod, _mm_set1_epi32, xmm) 164 | ML_OP(int3, int32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm) 165 | ML_OP(int3, int32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm) 166 | ML_OP(int3, int32_t, &, &=, _mm_and_si128, 
_mm_set1_epi32, xmm) 167 | ML_OP(int3, int32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm) 168 | ML_OP(int3, int32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm) 169 | 170 | // Misc 171 | 172 | ML_INLINE operator v4i() const { 173 | return xmm; 174 | } 175 | 176 | static ML_INLINE int3 Zero() { 177 | return _mm_setzero_si128(); 178 | } 179 | }; 180 | 181 | ML_INLINE int3 min(const int3& x, const int3& y) { 182 | return _mm_min_epi32(x.xmm, y.xmm); 183 | } 184 | 185 | ML_INLINE int3 max(const int3& x, const int3& y) { 186 | return _mm_max_epi32(x.xmm, y.xmm); 187 | } 188 | 189 | //====================================================================================================================== 190 | // int4 191 | //====================================================================================================================== 192 | 193 | union int4 { 194 | v4i xmm; 195 | 196 | struct { 197 | int32_t a[COORD_4D]; 198 | }; 199 | 200 | struct { 201 | int32_t x, y, z, w; 202 | }; 203 | 204 | ML_SWIZZLE_4(v4i_swizzle2, int2, v4i_swizzle3, int3, v4i_swizzle4, int4); 205 | 206 | public: 207 | ML_INLINE int4() 208 | : xmm(_mm_setzero_si128()) { 209 | } 210 | 211 | ML_INLINE int4(int32_t c) 212 | : xmm(_mm_set1_epi32(c)) { 213 | } 214 | 215 | ML_INLINE int4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) 216 | : xmm(v4i_set(_x, _y, _z, _w)) { 217 | } 218 | 219 | ML_INLINE int4(const int3& v, int32_t _w) 220 | : xmm(v4i_set(v.x, v.y, v.z, _w)) { 221 | } 222 | 223 | ML_INLINE int4(const int2& a, const int2& b) 224 | : xmm(v4i_set(a.x, a.y, b.x, b.y)) { 225 | } 226 | 227 | ML_INLINE int4(int32_t _x, const int3& v) 228 | : xmm(v4i_set(_x, v.x, v.y, v.z)) { 229 | } 230 | 231 | ML_INLINE int4(const v4i& v) 232 | : xmm(v) { 233 | } 234 | 235 | ML_INLINE int4(const int4& v) = default; 236 | 237 | // Set 238 | 239 | ML_INLINE void operator=(const int4& v) { 240 | xmm = v.xmm; 241 | } 242 | 243 | // Compare 244 | 245 | ML_COMPARE(bool4, int4, <, _mm_cmplt_epi32, 
_mm_movemask_epi32, xmm) 246 | ML_COMPARE(bool4, int4, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm) 247 | ML_COMPARE(bool4, int4, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm) 248 | ML_COMPARE(bool4, int4, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm) 249 | ML_COMPARE(bool4, int4, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm) 250 | ML_COMPARE(bool4, int4, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm) 251 | 252 | // Conversion 253 | 254 | ML_INLINE operator uint4() const; 255 | ML_INLINE operator float4() const; 256 | ML_INLINE operator double4() const; 257 | 258 | // Ops 259 | 260 | ML_INLINE int4 operator-() const { 261 | return _mm_xor_si128(xmm, _mm_set1_epi32(0x80000000)); 262 | } 263 | 264 | ML_OP(int4, int32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm) 265 | ML_OP(int4, int32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm) 266 | ML_OP(int4, int32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm) 267 | ML_OP(int4, int32_t, /, /=, _mm_div_epi32, _mm_set1_epi32, xmm) 268 | ML_OP(int4, int32_t, %, %=, v4i_mod, _mm_set1_epi32, xmm) 269 | ML_OP(int4, int32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm) 270 | ML_OP(int4, int32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm) 271 | ML_OP(int4, int32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm) 272 | ML_OP(int4, int32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm) 273 | ML_OP(int4, int32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm) 274 | 275 | // Misc 276 | 277 | ML_INLINE operator v4i() const { 278 | return xmm; 279 | } 280 | 281 | static ML_INLINE int4 Zero() { 282 | return _mm_setzero_si128(); 283 | } 284 | }; 285 | 286 | ML_INLINE int4 min(const int4& x, const int4& y) { 287 | return _mm_min_epi32(x.xmm, y.xmm); 288 | } 289 | 290 | ML_INLINE int4 max(const int4& x, const int4& y) { 291 | return _mm_max_epi32(x.xmm, y.xmm); 292 | } 293 | -------------------------------------------------------------------------------- /Guts/other.h: -------------------------------------------------------------------------------- 1 | 
// © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | //====================================================================================================================== 6 | // Other 7 | //====================================================================================================================== 8 | 9 | ML_INLINE float SplitZ_Logarithmic(uint32_t i, uint32_t splits, float fZnear, float fZfar) { 10 | float ratio = fZfar / fZnear; 11 | float k = float(i) / float(splits); 12 | float z = fZnear * pow(ratio, k); 13 | 14 | return z; 15 | } 16 | 17 | ML_INLINE float SplitZ_Uniform(uint32_t i, uint32_t splits, float fZnear, float fZfar) { 18 | float delta = fZfar - fZnear; 19 | float k = float(i) / float(splits); 20 | float z = fZnear + delta * k; 21 | 22 | return z; 23 | } 24 | 25 | ML_INLINE float SplitZ_Mixed(uint32_t i, uint32_t splits, float fZnear, float fZfar, float lambda) { 26 | float z_log = SplitZ_Logarithmic(i, splits, fZnear, fZfar); 27 | float z_uni = SplitZ_Uniform(i, splits, fZnear, fZfar); 28 | float z = lerp(z_log, z_uni, lambda); 29 | 30 | return z; 31 | } 32 | 33 | ML_INLINE uint32_t GreatestCommonDivisor(uint32_t a, uint32_t b) { 34 | while (a && b) { 35 | if (a >= b) 36 | a = a % b; 37 | else 38 | b = b % a; 39 | } 40 | 41 | return a + b; 42 | } 43 | 44 | ML_INLINE uint32_t LeastCommonMultiple(uint32_t a, uint32_t b) { 45 | return (a * b) / GreatestCommonDivisor(a, b); 46 | } 47 | 48 | //====================================================================================================================== 49 | // Ray-Triangle/AABB 50 | //====================================================================================================================== 51 | 52 | // NOTE: overlapping axis-aligned boundary box and triangle (center - aabb center, extents - half size) 53 | // NOTE: AABB-triangle overlap test code by Tomas Akenine-Moller 54 | // http://fileadmin.cs.lth.se/cs/Personal/Tomas_Akenine-Moller/code/ 55 | // SSE code from 
http://www.codercorner.com/blog/?p=1118 56 | 57 | ML_INLINE uint32_t TestClassIII(const v4f& e0V, const v4f& v0V, const v4f& v1V, const v4f& v2V, const v4f& extents) { 58 | v4f fe0ZYX_V = v4f_abs(e0V); 59 | 60 | v4f e0XZY_V = v4f_swizzle(e0V, 1, 2, 0, 3); 61 | v4f v0XZY_V = v4f_swizzle(v0V, 1, 2, 0, 3); 62 | v4f v1XZY_V = v4f_swizzle(v1V, 1, 2, 0, 3); 63 | v4f v2XZY_V = v4f_swizzle(v2V, 1, 2, 0, 3); 64 | v4f fe0XZY_V = v4f_swizzle(fe0ZYX_V, 1, 2, 0, 3); 65 | v4f extentsXZY_V = v4f_swizzle(extents, 1, 2, 0, 3); 66 | 67 | v4f radV = _mm_add_ps(_mm_mul_ps(extents, fe0XZY_V), _mm_mul_ps(extentsXZY_V, fe0ZYX_V)); 68 | v4f p0V = _mm_sub_ps(_mm_mul_ps(v0V, e0XZY_V), _mm_mul_ps(v0XZY_V, e0V)); 69 | v4f p1V = _mm_sub_ps(_mm_mul_ps(v1V, e0XZY_V), _mm_mul_ps(v1XZY_V, e0V)); 70 | v4f p2V = _mm_sub_ps(_mm_mul_ps(v2V, e0XZY_V), _mm_mul_ps(v2XZY_V, e0V)); 71 | 72 | v4f minV = _mm_min_ps(_mm_min_ps(p0V, p1V), p2V); 73 | v4f maxV = _mm_max_ps(_mm_max_ps(p0V, p1V), p2V); 74 | 75 | uint32_t test = _mm_movemask_ps(_mm_cmpgt_ps(minV, radV)); 76 | radV = _mm_sub_ps(_mm_setzero_ps(), radV); 77 | test |= _mm_movemask_ps(_mm_cmpgt_ps(radV, maxV)); 78 | 79 | return test & 7; 80 | } 81 | 82 | ML_INLINE bool IsOverlapBoxTriangle(const float3& boxcenter, const float3& extents, const float3& p0, const float3& p1, const float3& p2) { 83 | v4f v0V = _mm_sub_ps(p0.xmm, boxcenter.xmm); 84 | v4f cV = v4f_abs(v0V); 85 | uint32_t test = _mm_movemask_ps(_mm_sub_ps(cV, extents.xmm)); 86 | 87 | if ((test & 7) == 7) 88 | return true; 89 | 90 | v4f v1V = _mm_sub_ps(p1.xmm, boxcenter.xmm); 91 | v4f v2V = _mm_sub_ps(p2.xmm, boxcenter.xmm); 92 | v4f minV = _mm_min_ps(v0V, v1V); 93 | minV = _mm_min_ps(minV, v2V); 94 | test = _mm_movemask_ps(_mm_cmpgt_ps(minV, extents.xmm)); 95 | 96 | if (test & 7) 97 | return false; 98 | 99 | v4f maxV = _mm_max_ps(v0V, v1V); 100 | maxV = _mm_max_ps(maxV, v2V); 101 | cV = _mm_sub_ps(_mm_setzero_ps(), extents.xmm); 102 | test = _mm_movemask_ps(_mm_cmpgt_ps(cV, maxV)); 103 | 104 
| if (test & 7) 105 | return false; 106 | 107 | v4f e0V = _mm_sub_ps(v1V, v0V); 108 | v4f e1V = _mm_sub_ps(v2V, v1V); 109 | v4f normalV = v4f_cross(e0V, e1V); 110 | v4f dV = v4f_dot33(normalV, v0V); 111 | 112 | v4f normalSignsV = _mm_and_ps(normalV, c_v4f_Sign); 113 | maxV = _mm_or_ps(extents.xmm, normalSignsV); 114 | 115 | v4f tmpV = v4f_dot33(normalV, maxV); 116 | test = _mm_movemask_ps(_mm_cmpgt_ps(dV, tmpV)); 117 | 118 | if (test & 7) 119 | return false; 120 | 121 | normalSignsV = _mm_xor_ps(normalSignsV, c_v4f_Sign); 122 | minV = _mm_or_ps(extents.xmm, normalSignsV); 123 | 124 | tmpV = v4f_dot33(normalV, minV); 125 | test = _mm_movemask_ps(_mm_cmpgt_ps(tmpV, dV)); 126 | 127 | if (test & 7) 128 | return false; 129 | 130 | if (TestClassIII(e0V, v0V, v1V, v2V, extents.xmm)) 131 | return false; 132 | 133 | if (TestClassIII(e1V, v0V, v1V, v2V, extents.xmm)) 134 | return false; 135 | 136 | v4f e2V = _mm_sub_ps(v0V, v2V); 137 | 138 | if (TestClassIII(e2V, v0V, v1V, v2V, extents.xmm)) 139 | return false; 140 | 141 | return true; 142 | } 143 | 144 | // NOTE: barycentric ray-triangle test by Tomas Akenine-Moller 145 | ML_INLINE bool IsIntersectRayTriangle(const float3& origin, const float3& dir, const float3& v1, const float3& v2, const float3& v3, float3& out_tuv) { 146 | // find vectors for two edges sharing vert0 147 | float3 e1 = v2 - v1; 148 | float3 e2 = v3 - v1; 149 | 150 | // begin calculating determinant - also used to calculate U parameter 151 | float3 pvec = cross(dir, e2); 152 | 153 | // if determinant is near zero, ray lies in plane of triangle 154 | float det = dot(e1, pvec); 155 | 156 | if (det < -1e-6f) 157 | return false; 158 | 159 | // calculate distance from vert0 to ray origin 160 | float3 tvec = origin - v1; 161 | 162 | // calculate U parameter and test bounds 163 | float u = dot(tvec, pvec); 164 | 165 | if (u < 0.0f || u > det) 166 | return false; 167 | 168 | // prepare to test V parameter 169 | float3 qvec = cross(tvec, e1); 170 | 171 | // 
calculate V parameter and test bounds 172 | float v = dot(dir, qvec); 173 | 174 | if (v < 0.0f || u + v > det) 175 | return false; 176 | 177 | // calculate t, scale parameters, ray intersects triangle 178 | out_tuv.x = dot(e2, qvec); 179 | out_tuv.y = u; // v 180 | out_tuv.z = v; // 1 - (u + v) 181 | 182 | out_tuv /= det; 183 | 184 | return true; 185 | } 186 | 187 | ML_INLINE bool IsIntersectRayTriangle(const float3& from, const float3& to, const float3& v1, const float3& v2, const float3& v3, float3& out_intersection, float3& out_normal) { 188 | // find vectors for two edges sharing vert0 189 | float3 e1 = v2 - v1; 190 | float3 e2 = v3 - v1; 191 | 192 | // begin calculating determinant - also used to calculate U parameter 193 | float3 dir = to - from; 194 | float len = length(dir); 195 | dir = normalize(dir); 196 | 197 | float3 pvec = cross(dir, e2); 198 | 199 | // if determinant is near zero, ray lies in plane of triangle 200 | float det = dot(e1, pvec); 201 | 202 | if (det < -1e-6f) 203 | return false; 204 | 205 | // calculate distance from vert0 to ray origin point "from" 206 | float3 tvec = from - v1; 207 | 208 | // calculate U parameter and test bounds 209 | float u = dot(tvec, pvec); 210 | 211 | if (u < 0.0f || u > det) 212 | return false; 213 | 214 | // prepare to test V parameter 215 | float3 qvec = cross(tvec, e1); 216 | 217 | // calculate V parameter and test bounds 218 | float v = dot(dir, qvec); 219 | 220 | if (v < 0.0f || u + v > det) 221 | return false; 222 | 223 | // calculate t, scale parameters, ray intersects triangle 224 | float t = dot(e2, qvec) / det; 225 | 226 | if (t > len) 227 | return false; 228 | 229 | out_intersection = from + dir * t; 230 | out_normal = normalize(cross(e1, e2)); 231 | 232 | return true; 233 | } 234 | -------------------------------------------------------------------------------- /Guts/packing.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | 
#pragma once 4 | 5 | #define UF11_M_BITS 6 6 | #define UF11_E_BITS 5 7 | #define UF11_S_MASK 0x0 8 | 9 | #define UF10_M_BITS 5 10 | #define UF10_E_BITS 5 11 | #define UF10_S_MASK 0x0 12 | 13 | namespace Packing { 14 | 15 | template 16 | ML_INLINE uint32_t float4_to_unorm(const float4& v) { 17 | ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32"); 18 | 19 | constexpr uint32_t Rmask = (1 << Rbits) - 1; 20 | constexpr uint32_t Gmask = (1 << Gbits) - 1; 21 | constexpr uint32_t Bmask = (1 << Bbits) - 1; 22 | constexpr uint32_t Amask = (1 << Abits) - 1; 23 | 24 | constexpr uint32_t Gshift = Rbits & 31; 25 | constexpr uint32_t Bshift = (Gshift + Gbits) & 31; 26 | constexpr uint32_t Ashift = (Bshift + Bbits) & 31; 27 | 28 | const v4f scale = v4f_set(float(Rmask), float(Gmask), float(Bmask), float(Amask)); 29 | 30 | v4f t = _mm_mul_ps(v.xmm, scale); 31 | v4i i = _mm_cvtps_epi32(t); 32 | 33 | uint32_t p = _mm_cvtsi128_si32(i); 34 | p |= Gbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << Gshift); 35 | p |= Bbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(2, 2, 2, 2))) << Bshift); 36 | p |= Abits == 0 ? 
0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(3, 3, 3, 3))) << Ashift); 37 | 38 | return p; 39 | } 40 | 41 | template <> 42 | ML_INLINE uint32_t float4_to_unorm<8, 8, 8, 8>(const float4& v) { 43 | v4f t = _mm_mul_ps(v.xmm, _mm_set1_ps(255.0f)); 44 | v4i i = _mm_cvtps_epi32(t); 45 | i = _mm_shuffle_epi8(i, _mm_set1_epi32(0x0C080400)); 46 | 47 | return _mm_cvtsi128_si32(i); 48 | } 49 | 50 | ML_INLINE uint32_t float2_to_unorm_16_16(const float2& v) { 51 | v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f); 52 | t = _mm_mul_ps(t, _mm_set1_ps(65535.0f)); 53 | v4i i = _mm_cvtps_epi32(t); 54 | 55 | uint32_t p = _mm_cvtsi128_si32(i); 56 | p |= _mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << 16; 57 | 58 | return p; 59 | } 60 | 61 | ML_INLINE uint32_t float3_to_ufloat_11_11_10(const float3& v) { 62 | uint32_t r = ToSmallFloat(v.x); 63 | r |= ToSmallFloat(v.y) << 11; 64 | r |= ToSmallFloat(v.z) << 22; 65 | 66 | return r; 67 | } 68 | 69 | template 70 | ML_INLINE uint32_t float4_to_snorm(const float4& v) { 71 | ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32"); 72 | 73 | constexpr uint32_t Rmask = (1 << Rbits) - 1; 74 | constexpr uint32_t Gmask = (1 << Gbits) - 1; 75 | constexpr uint32_t Bmask = (1 << Bbits) - 1; 76 | constexpr uint32_t Amask = (1 << Abits) - 1; 77 | 78 | constexpr uint32_t Gshift = Rbits & 31; 79 | constexpr uint32_t Bshift = (Gshift + Gbits) & 31; 80 | constexpr uint32_t Ashift = (Bshift + Bbits) & 31; 81 | 82 | constexpr float Rrange = (1 << (Rbits == 0 ? 1 : (Rbits - 1))) - 1; 83 | constexpr float Grange = (1 << (Gbits == 0 ? 1 : (Gbits - 1))) - 1; 84 | constexpr float Brange = (1 << (Bbits == 0 ? 1 : (Bbits - 1))) - 1; 85 | constexpr float Arange = (1 << (Abits == 0 ? 
1 : (Abits - 1))) - 1; 86 | 87 | const v4f scale = v4f_set(Rrange, Grange, Brange, Arange); 88 | const v4i mask = _mm_setr_epi32(Rmask, Gmask, Bmask, Amask); 89 | 90 | v4f t = _mm_mul_ps(v.xmm, scale); 91 | v4i i = _mm_cvtps_epi32(t); 92 | i = _mm_and_si128(i, mask); 93 | 94 | uint32_t p = _mm_cvtsi128_si32(i); 95 | p |= Gbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << Gshift); 96 | p |= Bbits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(2, 2, 2, 2))) << Bshift); 97 | p |= Abits == 0 ? 0 : (_mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(3, 3, 3, 3))) << Ashift); 98 | 99 | return p; 100 | } 101 | 102 | template <> 103 | ML_INLINE uint32_t float4_to_snorm<8, 8, 8, 8>(const float4& v) { 104 | v4f t = _mm_mul_ps(v.xmm, _mm_set1_ps(127.0f)); 105 | v4i i = _mm_cvtps_epi32(t); 106 | i = _mm_shuffle_epi8(i, _mm_set1_epi32(0x0C080400)); 107 | 108 | return _mm_cvtsi128_si32(i); 109 | } 110 | 111 | ML_INLINE uint32_t float2_to_snorm_16_16(const float2& v) { 112 | v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f); 113 | t = _mm_mul_ps(t, _mm_set1_ps(32767.0f)); 114 | v4i i = _mm_cvtps_epi32(t); 115 | i = _mm_and_si128(i, _mm_setr_epi32(65535, 65535, 0, 0)); 116 | 117 | uint32_t p = _mm_cvtsi128_si32(i); 118 | p |= _mm_cvtsi128_si32(_mm_shuffle_epi32(i, _MM_SHUFFLE(1, 1, 1, 1))) << 16; 119 | 120 | return p; 121 | } 122 | 123 | ML_INLINE float16_t2 float2_to_float16_t2(const float2& v) { 124 | float16_t2 r; 125 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 126 | v4f t = v4f_set(v.x, v.y, 0.0f, 0.0f); 127 | v4i p = v4f_to_h4(t); 128 | 129 | *((int32_t*)&r) = _mm_cvtsi128_si32(p); 130 | #else 131 | r.x = float16_t(v.x); 132 | r.y = float16_t(v.y); 133 | #endif 134 | 135 | return r; 136 | } 137 | 138 | ML_INLINE float16_t4 float4_to_float16_t4(const float4& v) { 139 | float16_t4 r; 140 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 141 | v4i p = v4f_to_h4(v.xmm); 142 | *((int64_t*)&r) = _mm_extract_epi64(p, 0); 143 | #else 144 | 
float16_t2 xy = float2_to_float16_t2(v.xy); 145 | float16_t2 zw = float2_to_float16_t2(v.zw); 146 | 147 | r = float16_t4(xy, zw); 148 | #endif 149 | 150 | return r; 151 | } 152 | 153 | template 154 | ML_INLINE float4 unorm_to_float4(uint32_t p) { 155 | ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32"); 156 | 157 | constexpr uint32_t Rmask = (1 << Rbits) - 1; 158 | constexpr uint32_t Gmask = (1 << Gbits) - 1; 159 | constexpr uint32_t Bmask = (1 << Bbits) - 1; 160 | constexpr uint32_t Amask = (1 << Abits) - 1; 161 | 162 | constexpr uint32_t Gshift = Rbits & 31; 163 | constexpr uint32_t Bshift = (Gshift + Gbits) & 31; 164 | constexpr uint32_t Ashift = (Bshift + Bbits) & 31; 165 | 166 | constexpr float invRmask = Rmask == 0 ? 1.0f : 1.0f / Rmask; 167 | constexpr float invGmask = Gmask == 0 ? 1.0f : 1.0f / Gmask; 168 | constexpr float invBmask = Bmask == 0 ? 1.0f : 1.0f / Bmask; 169 | constexpr float invAmask = Amask == 0 ? 1.0f : 1.0f / Amask; 170 | 171 | const v4f scale = v4f_set(invRmask, invGmask, invBmask, invAmask); 172 | 173 | v4i i = _mm_setr_epi32(p & Rmask, (p >> Gshift) & Gmask, (p >> Bshift) & Bmask, (p >> Ashift) & Amask); 174 | v4f t = _mm_cvtepi32_ps(i); 175 | t = _mm_mul_ps(t, scale); 176 | 177 | return t; 178 | } 179 | 180 | template <> 181 | ML_INLINE float4 unorm_to_float4<8, 8, 8, 8>(uint32_t p) { 182 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4) 183 | v4i i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p)); 184 | #else 185 | v4i i = _mm_set_epi32(p >> 24, (p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF); 186 | #endif 187 | 188 | v4f t = _mm_cvtepi32_ps(i); 189 | t = _mm_mul_ps(t, _mm_set1_ps(1.0f / 255.0f)); 190 | 191 | return t; 192 | } 193 | 194 | ML_INLINE float3 ufloat_11_11_10_to_float3(uint32_t p) { 195 | float3 v; 196 | v.x = FromSmallFloat(p & ((1 << 11) - 1)); 197 | v.y = FromSmallFloat((p >> 11) & ((1 << 11) - 1)); 198 | v.z = FromSmallFloat((p >> 22) & ((1 << 10) - 1)); 199 | 200 | return v; 201 | } 202 | 203 | 
template 204 | ML_INLINE float4 snorm_to_float4(uint32_t p) { 205 | ML_StaticAssertMsg(Rbits + Gbits + Bbits + Abits <= 32, "Sum of all bit must be <= 32"); 206 | 207 | constexpr uint32_t Rmask = (1 << Rbits) - 1; 208 | constexpr uint32_t Gmask = (1 << Gbits) - 1; 209 | constexpr uint32_t Bmask = (1 << Bbits) - 1; 210 | constexpr uint32_t Amask = (1 << Abits) - 1; 211 | 212 | constexpr uint32_t Gshift = Rbits & 31; 213 | constexpr uint32_t Bshift = (Gshift + Gbits) & 31; 214 | constexpr uint32_t Ashift = (Bshift + Bbits) & 31; 215 | 216 | constexpr uint32_t Rsign = 1 << (Rbits == 0 ? 0 : (Rbits - 1)); 217 | constexpr uint32_t Gsign = 1 << (Gbits == 0 ? 0 : (Gbits - 1)); 218 | constexpr uint32_t Bsign = 1 << (Bbits == 0 ? 0 : (Bbits - 1)); 219 | constexpr uint32_t Asign = 1 << (Abits == 0 ? 0 : (Abits - 1)); 220 | 221 | constexpr float invRsignMinus1 = Rbits == 0 ? 1.0f : 1.0f / (Rsign - 1); 222 | constexpr float invGsignMinus1 = Gbits == 0 ? 1.0f : 1.0f / (Gsign - 1); 223 | constexpr float invBsignMinus1 = Bbits == 0 ? 1.0f : 1.0f / (Bsign - 1); 224 | constexpr float invAsignMinus1 = Abits == 0 ? 
1.0f : 1.0f / (Asign - 1); 225 | 226 | const v4i vsign = _mm_setr_epi32(Rsign, Gsign, Bsign, Asign); 227 | const v4i vor = _mm_setr_epi32(~(Rsign - 1), ~(Gsign - 1), ~(Bsign - 1), ~(Asign - 1)); 228 | const v4f vscale = v4f_set(invRsignMinus1, invGsignMinus1, invBsignMinus1, invAsignMinus1); 229 | 230 | v4i i = _mm_setr_epi32(p & Rmask, (p >> Gshift) & Gmask, (p >> Bshift) & Bmask, (p >> Ashift) & Amask); 231 | 232 | v4i mask = _mm_and_si128(i, vsign); 233 | v4i ii = _mm_or_si128(i, vor); 234 | i = v4i_select(i, ii, _mm_cmpeq_epi32(mask, _mm_setzero_si128())); 235 | 236 | v4f t = _mm_cvtepi32_ps(i); 237 | t = _mm_mul_ps(t, vscale); 238 | t = _mm_max_ps(t, _mm_set1_ps(-1.0f)); 239 | 240 | return t; 241 | } 242 | 243 | template <> 244 | ML_INLINE float4 snorm_to_float4<8, 8, 8, 8>(uint32_t p) { 245 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4) 246 | v4i i = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(p)); 247 | #else 248 | v4i i = _mm_set_epi32(int8_t(p >> 24), int8_t((p >> 16) & 0xFF), int8_t((p >> 8) & 0xFF), int8_t(p & 0xFF)); 249 | #endif 250 | 251 | v4f t = _mm_cvtepi32_ps(i); 252 | t = _mm_mul_ps(t, _mm_set1_ps(1.0f / 127.0f)); 253 | t = _mm_max_ps(t, _mm_set1_ps(-1.0f)); 254 | 255 | return t; 256 | } 257 | 258 | ML_INLINE float2 float16_t2_to_float2(const float16_t2& p) { 259 | float2 r; 260 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 261 | v4i t = _mm_cvtsi32_si128(*(int32_t*)&p); 262 | v4f f = _mm_cvtph_ps(t); 263 | 264 | _mm_storel_pi((__m64*)&r.mm, f); 265 | #else 266 | r.x = float(p.x); 267 | r.y = float(p.y); 268 | #endif 269 | 270 | return r; 271 | } 272 | 273 | ML_INLINE float4 float16_t4_to_float4(const float16_t4& p) { 274 | float4 f; 275 | #if (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 276 | v4i t = _mm_loadu_si64(&p); 277 | f.xmm = _mm_cvtph_ps(t); 278 | #else 279 | f.x = float(p.x); 280 | f.y = float(p.y); 281 | f.z = float(p.z); 282 | f.w = float(p.w); 283 | #endif 284 | 285 | return f; 286 | } 287 | 288 | } // namespace Packing 289 | 
// --------------------------------------------------------------------------------
// /Guts/sorting.h:
// --------------------------------------------------------------------------------
// © 2021 NVIDIA Corporation

#pragma once

// NOTE: returns p1 < p2 ? -1 : (p1 > p2 ? 1 : 0)

typedef int32_t (*pfn_cmp_qsort)(const void* p1, const void* p2);

// NOTE: true - swap, false - keep; a - left, b - right

// NOTE(review): the "template <...>" parameter lists in this file were missing in the reviewed text. They are
// restored from the usage of "T" and "cmp" in the bodies - confirm against upstream

// Default comparator: leads to ascending order
template <class T>
inline bool Sort_default_less(const T& a, const T& b) {
    return a < b;
}

// Default comparator: leads to descending order
template <class T>
inline bool Sort_default_greater(const T& a, const T& b) {
    return a > b;
}

/*
bool Sort_multikey(const T& a, const T& b)
{
    if( a.property1 > b.property1 )
        return true;

    if( a.property1 == b.property1 )
    {
        if( a.property2 > b.property2 )
            return true;

        if( a.property2 == b.property2 )
            return a.property3 > b.property3;
    }

    return false;
}
*/

// NOTE: heap sort
// memory: O(1)
// random: +40% vs qsort
// sorted: -30% vs qsort
// reversed: -30% vs qsort

// Sorts "n" elements of "a" in place. "cmp" is a compile-time comparator, e.g. "Sort_default_less<T>" gives
// ascending order. "T" must be default-constructible and copy-assignable. Arrays with less than 2 elements are
// left as is.
template <class T, bool (*cmp)(const T& a, const T& b)>
void Sort_heap(T* a, uint32_t n) {
    if (n < 2)
        return;

    // While "i > 0" the heap is being built (bottom-up, starting from the last parent). After that each
    // iteration moves the heap root to the end of the array and shrinks the heap by one element
    uint32_t i = n >> 1;

    for (;;) {
        T t;

        if (i > 0)
            t = a[--i];
        else {
            if (--n == 0)
                return;

            t = a[n];
            a[n] = a[0];
        }

        // Sift "t" down starting from the slot "i"
        uint32_t parent = i;
        uint32_t child = (i << 1) + 1;

        while (child < n) {
            // Pick the child which has to be closer to the root
            if (child + 1 < n && cmp(a[child], a[child + 1]))
                child++;

            if (cmp(t, a[child])) {
                a[parent] = a[child];

                parent = child;
                child = (parent << 1) + 1;
            } else
                break;
        }

        a[parent] = t;
    }
}

// NOTE: merge sort
// memory: O(n), t - temp array, return pointer to sorted array (can be a or t)
// random: +130% vs qsort
// sorted: +35% vs qsort
// reversed: +40% vs qsort

template 93 | T* Sort_merge(T* t, T* a, uint32_t n) { 94 | if (n < 2) 95 | return a; 96 | 97 | uint32_t n2 = n << 1; 98 | 99 | for (uint32_t size = 2; size < n2; size <<= 1) { 100 | T* tmp = t; 101 | 102 | for (uint32_t i = 0; i < n; i += size) { 103 | uint32_t j = i; 104 | uint32_t nj = i + (size >> 1); 105 | 106 | if (nj > n) 107 | nj = n; 108 | 109 | uint32_t k = nj; 110 | uint32_t nk = i + size; 111 | 112 | if (nk > n) 113 | nk = n; 114 | 115 | while (j < nj && k < nk) 116 | *tmp++ = cmp(a[j], a[k]) ? a[j++] : a[k++]; 117 | 118 | nj -= j; 119 | nk -= k; 120 | 121 | if (nj) { 122 | memcpy(tmp, a + j, nj * sizeof(T)); 123 | tmp += nj; 124 | } 125 | 126 | if (nk) { 127 | memcpy(tmp, a + k, nk * sizeof(T)); 128 | tmp += nk; 129 | } 130 | } 131 | 132 | tmp = a; 133 | a = t; 134 | t = tmp; 135 | } 136 | 137 | return a; 138 | } 139 | -------------------------------------------------------------------------------- /Guts/swizzle.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | #define ML_X 0 6 | #define ML_Y 1 7 | #define ML_Z 2 8 | #define ML_W 3 9 | 10 | template 11 | class swizzle { 12 | private: 13 | // Based on: https://kiorisyshen.github.io/2018/08/27/Vector%20Swizzling%20and%20Parameter%20Pack%20in%20C++/ 14 | T a[sizeof...(Indices)]; 15 | 16 | public: 17 | static constexpr uint32_t i[] = {Indices...}; 18 | static constexpr size_t N = sizeof...(Indices); 19 | 20 | ML_INLINE void operator=(const C& rhs) { 21 | for (size_t n = 0; n < N; n++) 22 | a[i[n]] = rhs[n]; 23 | } 24 | 25 | ML_INLINE operator C() const { 26 | return C(a[Indices]...); 27 | } 28 | }; 29 | 30 | // Swizzle ops 31 | 32 | #define ML_SWIZZLE_2_OP(op, f, swizzle) \ 33 | ML_INLINE void operator op(const C& v) { \ 34 | ML_StaticAssertMsg(X != Y, "Wrong swizzle in " ML_Stringify(op)); \ 35 | a[X] op v.x; \ 36 | a[Y] op v.y; \ 37 | } 38 | 39 | #define ML_SWIZZLE_3_OP(op, f, swizzle) \ 40 | ML_INLINE void 
operator op(const C& v) { \ 41 | ML_StaticAssertMsg(X != Y && Y != Z && Z != X, "Wrong swizzle in " ML_Stringify(op)); \ 42 | a[X] op v.x; \ 43 | a[Y] op v.y; \ 44 | a[Z] op v.z; \ 45 | } 46 | 47 | #if 0 48 | # define ML_SWIZZLE_4_OP(op, f, swizzle) \ 49 | ML_INLINE void operator op(const C& v) { \ 50 | ML_StaticAssertMsg(X + Y + Z + W == 6, "Wrong swizzle in " ML_Stringify(op)); \ 51 | a[X] op v.x; \ 52 | a[Y] op v.y; \ 53 | a[Z] op v.z; \ 54 | a[W] op v.w; \ 55 | } 56 | #else 57 | # define ML_SWIZZLE_4_OP(op, f, swizzle) \ 58 | ML_INLINE void operator op(const C& v) { \ 59 | ML_StaticAssertMsg(X + Y + Z + W == 6, "Wrong swizzle in " ML_Stringify(op)); \ 60 | vec = f(swizzle(vec, X, Y, Z, W), v); \ 61 | } 62 | #endif 63 | 64 | // v4i 65 | 66 | template 67 | class v4i_swizzle2 { 68 | private: 69 | union { 70 | struct { 71 | v4i vec; 72 | }; 73 | 74 | struct { 75 | int32_t a[COORD_4D]; 76 | }; 77 | }; 78 | 79 | public: 80 | // Read-only: fast 81 | ML_INLINE operator C() const { 82 | return C(a[X], a[Y]); 83 | } 84 | 85 | // Read-write: most likely slow 86 | ML_SWIZZLE_2_OP(=, _mm_copy, v4i_swizzle) 87 | ML_SWIZZLE_2_OP(-=, _mm_sub_epi32, v4i_swizzle) 88 | ML_SWIZZLE_2_OP(+=, _mm_add_epi32, v4i_swizzle) 89 | ML_SWIZZLE_2_OP(*=, _mm_mullo_epi32, v4i_swizzle) 90 | ML_SWIZZLE_2_OP(/=, _mm_div_epi32, v4i_swizzle) 91 | ML_SWIZZLE_2_OP(%=, v4i_mod, v4i_swizzle) 92 | ML_SWIZZLE_2_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 93 | ML_SWIZZLE_2_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 94 | ML_SWIZZLE_2_OP(&=, _mm_and_si128, v4i_swizzle) 95 | ML_SWIZZLE_2_OP(|=, _mm_or_si128, v4i_swizzle) 96 | ML_SWIZZLE_2_OP(^=, _mm_xor_si128, v4i_swizzle) 97 | }; 98 | 99 | template 100 | class v4i_swizzle3 { 101 | private: 102 | union { 103 | struct { 104 | v4i vec; 105 | }; 106 | 107 | struct { 108 | int32_t a[COORD_4D]; 109 | }; 110 | }; 111 | 112 | public: 113 | // Read-only: fast 114 | ML_INLINE operator C() const { 115 | return v4i_swizzle(vec, X, Y, Z, 3); 116 | } 117 | 118 | // Read-write: most 
likely slow 119 | ML_SWIZZLE_3_OP(=, _mm_copy, v4i_swizzle) 120 | ML_SWIZZLE_3_OP(-=, _mm_sub_epi32, v4i_swizzle) 121 | ML_SWIZZLE_3_OP(+=, _mm_add_epi32, v4i_swizzle) 122 | ML_SWIZZLE_3_OP(*=, _mm_mullo_epi32, v4i_swizzle) 123 | ML_SWIZZLE_3_OP(/=, _mm_div_epi32, v4i_swizzle) 124 | ML_SWIZZLE_3_OP(%=, v4i_mod, v4i_swizzle) 125 | ML_SWIZZLE_3_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 126 | ML_SWIZZLE_3_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 127 | ML_SWIZZLE_3_OP(&=, _mm_and_si128, v4i_swizzle) 128 | ML_SWIZZLE_3_OP(|=, _mm_or_si128, v4i_swizzle) 129 | ML_SWIZZLE_3_OP(^=, _mm_xor_si128, v4i_swizzle) 130 | }; 131 | 132 | template 133 | class v4i_swizzle4 { 134 | private: 135 | union { 136 | struct { 137 | v4i vec; 138 | }; 139 | 140 | struct { 141 | int32_t a[COORD_4D]; 142 | }; 143 | }; 144 | 145 | public: 146 | // Read-only: fast 147 | ML_INLINE operator C() const { 148 | return v4i_swizzle(vec, X, Y, Z, W); 149 | } 150 | 151 | // Read-write: most likely slow 152 | ML_SWIZZLE_4_OP(=, _mm_copy, v4i_swizzle) 153 | ML_SWIZZLE_4_OP(-=, _mm_sub_epi32, v4i_swizzle) 154 | ML_SWIZZLE_4_OP(+=, _mm_add_epi32, v4i_swizzle) 155 | ML_SWIZZLE_4_OP(*=, _mm_mullo_epi32, v4i_swizzle) 156 | ML_SWIZZLE_4_OP(/=, _mm_div_epi32, v4i_swizzle) 157 | ML_SWIZZLE_4_OP(%=, v4i_mod, v4i_swizzle) 158 | ML_SWIZZLE_4_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 159 | ML_SWIZZLE_4_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 160 | ML_SWIZZLE_4_OP(&=, _mm_and_si128, v4i_swizzle) 161 | ML_SWIZZLE_4_OP(|=, _mm_or_si128, v4i_swizzle) 162 | ML_SWIZZLE_4_OP(^=, _mm_xor_si128, v4i_swizzle) 163 | }; 164 | 165 | // v4u 166 | 167 | template 168 | class v4u_swizzle2 { 169 | private: 170 | union { 171 | struct { 172 | v4i vec; 173 | }; 174 | 175 | struct { 176 | uint32_t a[COORD_4D]; 177 | }; 178 | }; 179 | 180 | public: 181 | // Read-only: fast 182 | ML_INLINE operator C() const { 183 | return C(a[X], a[Y]); 184 | } 185 | 186 | // Read-write: most likely slow 187 | ML_SWIZZLE_2_OP(=, _mm_copy, v4i_swizzle) 188 | 
ML_SWIZZLE_2_OP(-=, _mm_sub_epi32, v4i_swizzle) 189 | ML_SWIZZLE_2_OP(+=, _mm_add_epi32, v4i_swizzle) 190 | ML_SWIZZLE_2_OP(*=, _mm_mullo_epi32, v4i_swizzle) 191 | ML_SWIZZLE_2_OP(/=, _mm_div_epu32, v4i_swizzle) 192 | ML_SWIZZLE_2_OP(%=, v4u_mod, v4i_swizzle) 193 | ML_SWIZZLE_2_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 194 | ML_SWIZZLE_2_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 195 | ML_SWIZZLE_2_OP(&=, _mm_and_si128, v4i_swizzle) 196 | ML_SWIZZLE_2_OP(|=, _mm_or_si128, v4i_swizzle) 197 | ML_SWIZZLE_2_OP(^=, _mm_xor_si128, v4i_swizzle) 198 | }; 199 | 200 | template 201 | class v4u_swizzle3 { 202 | private: 203 | union { 204 | struct { 205 | v4i vec; 206 | }; 207 | 208 | struct { 209 | uint32_t a[COORD_4D]; 210 | }; 211 | }; 212 | 213 | public: 214 | // Read-only: fast 215 | ML_INLINE operator C() const { 216 | return v4i_swizzle(vec, X, Y, Z, 3); 217 | } 218 | 219 | // Read-write: most likely slow 220 | ML_SWIZZLE_3_OP(=, _mm_copy, v4i_swizzle) 221 | ML_SWIZZLE_3_OP(-=, _mm_sub_epi32, v4i_swizzle) 222 | ML_SWIZZLE_3_OP(+=, _mm_add_epi32, v4i_swizzle) 223 | ML_SWIZZLE_3_OP(*=, _mm_mullo_epi32, v4i_swizzle) 224 | ML_SWIZZLE_3_OP(/=, _mm_div_epu32, v4i_swizzle) 225 | ML_SWIZZLE_3_OP(%=, v4u_mod, v4i_swizzle) 226 | ML_SWIZZLE_3_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 227 | ML_SWIZZLE_3_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 228 | ML_SWIZZLE_3_OP(&=, _mm_and_si128, v4i_swizzle) 229 | ML_SWIZZLE_3_OP(|=, _mm_or_si128, v4i_swizzle) 230 | ML_SWIZZLE_3_OP(^=, _mm_xor_si128, v4i_swizzle) 231 | }; 232 | 233 | template 234 | class v4u_swizzle4 { 235 | private: 236 | union { 237 | struct { 238 | v4i vec; 239 | }; 240 | 241 | struct { 242 | uint32_t a[COORD_4D]; 243 | }; 244 | }; 245 | 246 | public: 247 | // Read-only: fast 248 | ML_INLINE operator C() const { 249 | return v4i_swizzle(vec, X, Y, Z, W); 250 | } 251 | 252 | // Read-write: most likely slow 253 | ML_SWIZZLE_4_OP(=, _mm_copy, v4i_swizzle) 254 | ML_SWIZZLE_4_OP(-=, _mm_sub_epi32, v4i_swizzle) 255 | ML_SWIZZLE_4_OP(+=, 
_mm_add_epi32, v4i_swizzle) 256 | ML_SWIZZLE_4_OP(*=, _mm_mullo_epi32, v4i_swizzle) 257 | ML_SWIZZLE_4_OP(/=, _mm_div_epu32, v4i_swizzle) 258 | ML_SWIZZLE_4_OP(%=, v4u_mod, v4i_swizzle) 259 | ML_SWIZZLE_4_OP(<<=, _mm_sllv_epi32, v4i_swizzle) 260 | ML_SWIZZLE_4_OP(>>=, _mm_srlv_epi32, v4i_swizzle) 261 | ML_SWIZZLE_4_OP(&=, _mm_and_si128, v4i_swizzle) 262 | ML_SWIZZLE_4_OP(|=, _mm_or_si128, v4i_swizzle) 263 | ML_SWIZZLE_4_OP(^=, _mm_xor_si128, v4i_swizzle) 264 | }; 265 | 266 | // v4f 267 | 268 | template 269 | class v4f_swizzle2 { 270 | private: 271 | union { 272 | struct { 273 | v4f vec; 274 | }; 275 | 276 | struct { 277 | float a[COORD_4D]; 278 | }; 279 | }; 280 | 281 | public: 282 | // Read-only: fast 283 | ML_INLINE operator C() const { 284 | return C(a[X], a[Y]); 285 | } 286 | 287 | // Read-write: most likely slow 288 | ML_SWIZZLE_2_OP(=, _mm_copy, v4f_swizzle) 289 | ML_SWIZZLE_2_OP(-=, _mm_sub_ps, v4f_swizzle) 290 | ML_SWIZZLE_2_OP(+=, _mm_add_ps, v4f_swizzle) 291 | ML_SWIZZLE_2_OP(*=, _mm_mul_ps, v4f_swizzle) 292 | ML_SWIZZLE_2_OP(/=, _mm_div_ps, v4f_swizzle) 293 | }; 294 | 295 | template 296 | class v4f_swizzle3 { 297 | private: 298 | union { 299 | struct { 300 | v4f vec; 301 | }; 302 | 303 | struct { 304 | float a[COORD_4D]; 305 | }; 306 | }; 307 | 308 | public: 309 | // Read-only: fast 310 | ML_INLINE operator C() const { 311 | return v4f_swizzle(vec, X, Y, Z, 3); 312 | } 313 | 314 | // Read-write: most likely slow 315 | ML_SWIZZLE_3_OP(=, _mm_copy, v4f_swizzle) 316 | ML_SWIZZLE_3_OP(-=, _mm_sub_ps, v4f_swizzle) 317 | ML_SWIZZLE_3_OP(+=, _mm_add_ps, v4f_swizzle) 318 | ML_SWIZZLE_3_OP(*=, _mm_mul_ps, v4f_swizzle) 319 | ML_SWIZZLE_3_OP(/=, _mm_div_ps, v4f_swizzle) 320 | }; 321 | 322 | template 323 | class v4f_swizzle4 { 324 | private: 325 | union { 326 | struct { 327 | v4f vec; 328 | }; 329 | 330 | struct { 331 | float a[COORD_4D]; 332 | }; 333 | }; 334 | 335 | public: 336 | // Read-only: fast 337 | ML_INLINE operator C() const { 338 | return v4f_swizzle(vec, 
X, Y, Z, W); 339 | } 340 | 341 | // Read-write: most likely slow 342 | ML_SWIZZLE_4_OP(=, _mm_copy, v4f_swizzle) 343 | ML_SWIZZLE_4_OP(-=, _mm_sub_ps, v4f_swizzle) 344 | ML_SWIZZLE_4_OP(+=, _mm_add_ps, v4f_swizzle) 345 | ML_SWIZZLE_4_OP(*=, _mm_mul_ps, v4f_swizzle) 346 | ML_SWIZZLE_4_OP(/=, _mm_div_ps, v4f_swizzle) 347 | }; 348 | 349 | // v4d 350 | 351 | template 352 | class v4d_swizzle2 { 353 | private: 354 | union { 355 | struct { 356 | v4d vec; 357 | }; 358 | 359 | struct { 360 | double a[COORD_4D]; 361 | }; 362 | }; 363 | 364 | public: 365 | // Read-only: fast 366 | ML_INLINE operator C() const { 367 | return C(a[X], a[Y]); 368 | } 369 | 370 | // Read-write: most likely slow 371 | ML_SWIZZLE_2_OP(=, _mm_copy, v4d_swizzle) 372 | ML_SWIZZLE_2_OP(-=, _mm256_sub_pd, v4d_swizzle) 373 | ML_SWIZZLE_2_OP(+=, _mm256_add_pd, v4d_swizzle) 374 | ML_SWIZZLE_2_OP(*=, _mm256_mul_pd, v4d_swizzle) 375 | ML_SWIZZLE_2_OP(/=, _mm256_div_pd, v4d_swizzle) 376 | }; 377 | 378 | template 379 | class v4d_swizzle3 { 380 | private: 381 | union { 382 | struct { 383 | v4d vec; 384 | }; 385 | 386 | struct { 387 | double a[COORD_4D]; 388 | }; 389 | }; 390 | 391 | public: 392 | // Read-only: fast 393 | ML_INLINE operator C() const { 394 | return v4d_swizzle(vec, X, Y, Z, 3); 395 | } 396 | 397 | // Read-write: most likely slow 398 | ML_SWIZZLE_3_OP(=, _mm_copy, v4d_swizzle) 399 | ML_SWIZZLE_3_OP(-=, _mm256_sub_pd, v4d_swizzle) 400 | ML_SWIZZLE_3_OP(+=, _mm256_add_pd, v4d_swizzle) 401 | ML_SWIZZLE_3_OP(*=, _mm256_mul_pd, v4d_swizzle) 402 | ML_SWIZZLE_3_OP(/=, _mm256_div_pd, v4d_swizzle) 403 | }; 404 | 405 | template 406 | class v4d_swizzle4 { 407 | private: 408 | union { 409 | struct { 410 | v4d vec; 411 | }; 412 | 413 | struct { 414 | double a[COORD_4D]; 415 | }; 416 | }; 417 | 418 | public: 419 | // Read-only: fast 420 | ML_INLINE operator C() const { 421 | return v4d_swizzle(vec, X, Y, Z, W); 422 | } 423 | 424 | // Read-write: most likely slow 425 | ML_SWIZZLE_4_OP(=, _mm_copy, v4d_swizzle) 
426 | ML_SWIZZLE_4_OP(-=, _mm256_sub_pd, v4d_swizzle) 427 | ML_SWIZZLE_4_OP(+=, _mm256_add_pd, v4d_swizzle) 428 | ML_SWIZZLE_4_OP(*=, _mm256_mul_pd, v4d_swizzle) 429 | ML_SWIZZLE_4_OP(/=, _mm256_div_pd, v4d_swizzle) 430 | }; 431 | 432 | #undef ML_SWIZZLE_2_OP 433 | #undef ML_SWIZZLE_3_OP 434 | #undef ML_SWIZZLE_4_OP 435 | 436 | // swizzles 437 | 438 | #define ML_SWIZZLE_2(C, T) \ 439 | swizzle xx; \ 440 | swizzle xy; \ 441 | swizzle yx; \ 442 | swizzle yy 443 | 444 | #define ML_SWIZZLE_3(S2, C2, S3, C3) \ 445 | S2 xx; \ 446 | S2 xy; \ 447 | S2 xz; \ 448 | S2 yx; \ 449 | S2 yy; \ 450 | S2 yz; \ 451 | S2 zx; \ 452 | S2 zy; \ 453 | S2 zz; \ 454 | S3 xxx; \ 455 | S3 xxy; \ 456 | S3 xxz; \ 457 | S3 xyx; \ 458 | S3 xyy; \ 459 | S3 xyz; \ 460 | S3 xzx; \ 461 | S3 xzy; \ 462 | S3 xzz; \ 463 | S3 yxx; \ 464 | S3 yxy; \ 465 | S3 yxz; \ 466 | S3 yyx; \ 467 | S3 yyy; \ 468 | S3 yyz; \ 469 | S3 yzx; \ 470 | S3 yzy; \ 471 | S3 yzz; \ 472 | S3 zxx; \ 473 | S3 zxy; \ 474 | S3 zxz; \ 475 | S3 zyx; \ 476 | S3 zyy; \ 477 | S3 zyz; \ 478 | S3 zzx; \ 479 | S3 zzy; \ 480 | S3 zzz 481 | 482 | #define ML_SWIZZLE_4(S2, C2, S3, C3, S4, C4) \ 483 | S2 xx; \ 484 | S2 xy; \ 485 | S2 xz; \ 486 | S2 xw; \ 487 | S2 yx; \ 488 | S2 yy; \ 489 | S2 yz; \ 490 | S2 yw; \ 491 | S2 zx; \ 492 | S2 zy; \ 493 | S2 zz; \ 494 | S2 zw; \ 495 | S2 wx; \ 496 | S2 wy; \ 497 | S2 wz; \ 498 | S2 ww; \ 499 | S3 xxx; \ 500 | S3 xxy; \ 501 | S3 xxz; \ 502 | S3 xxw; \ 503 | S3 xyx; \ 504 | S3 xyy; \ 505 | S3 xyz; \ 506 | S3 xyw; \ 507 | S3 xzx; \ 508 | S3 xzy; \ 509 | S3 xzz; \ 510 | S3 xzw; \ 511 | S3 xwx; \ 512 | S3 xwy; \ 513 | S3 xwz; \ 514 | S3 xww; \ 515 | S3 yxx; \ 516 | S3 yxy; \ 517 | S3 yxz; \ 518 | S3 yxw; \ 519 | S3 yyx; \ 520 | S3 yyy; \ 521 | S3 yyz; \ 522 | S3 yyw; \ 523 | S3 yzx; \ 524 | S3 yzy; \ 525 | S3 yzz; \ 526 | S3 yzw; \ 527 | S3 ywx; \ 528 | S3 ywy; \ 529 | S3 ywz; \ 530 | S3 yww; \ 531 | S3 zxx; \ 532 | S3 zxy; \ 533 | S3 zxz; \ 534 | S3 zxw; \ 535 | S3 zyx; \ 536 | S3 zyy; \ 537 | S3 zyz; \ 
538 | S3 zyw; \ 539 | S3 zzx; \ 540 | S3 zzy; \ 541 | S3 zzz; \ 542 | S3 zzw; \ 543 | S3 zwx; \ 544 | S3 zwy; \ 545 | S3 zwz; \ 546 | S3 zww; \ 547 | S3 wxx; \ 548 | S3 wxy; \ 549 | S3 wxz; \ 550 | S3 wxw; \ 551 | S3 wyx; \ 552 | S3 wyy; \ 553 | S3 wyz; \ 554 | S3 wyw; \ 555 | S3 wzx; \ 556 | S3 wzy; \ 557 | S3 wzz; \ 558 | S3 wzw; \ 559 | S3 wwx; \ 560 | S3 wwy; \ 561 | S3 wwz; \ 562 | S3 www; \ 563 | S4 xxxx; \ 564 | S4 xxxy; \ 565 | S4 xxxz; \ 566 | S4 xxxw; \ 567 | S4 xxyx; \ 568 | S4 xxyy; \ 569 | S4 xxyz; \ 570 | S4 xxyw; \ 571 | S4 xxzx; \ 572 | S4 xxzy; \ 573 | S4 xxzz; \ 574 | S4 xxzw; \ 575 | S4 xxwx; \ 576 | S4 xxwy; \ 577 | S4 xxwz; \ 578 | S4 xxww; \ 579 | S4 xyxx; \ 580 | S4 xyxy; \ 581 | S4 xyxz; \ 582 | S4 xyxw; \ 583 | S4 xyyx; \ 584 | S4 xyyy; \ 585 | S4 xyyz; \ 586 | S4 xyyw; \ 587 | S4 xyzx; \ 588 | S4 xyzy; \ 589 | S4 xyzz; \ 590 | S4 xyzw; \ 591 | S4 xywx; \ 592 | S4 xywy; \ 593 | S4 xywz; \ 594 | S4 xyww; \ 595 | S4 xzxx; \ 596 | S4 xzxy; \ 597 | S4 xzxz; \ 598 | S4 xzxw; \ 599 | S4 xzyx; \ 600 | S4 xzyy; \ 601 | S4 xzyz; \ 602 | S4 xzyw; \ 603 | S4 xzzx; \ 604 | S4 xzzy; \ 605 | S4 xzzz; \ 606 | S4 xzzw; \ 607 | S4 xzwx; \ 608 | S4 xzwy; \ 609 | S4 xzwz; \ 610 | S4 xzww; \ 611 | S4 xwxx; \ 612 | S4 xwxy; \ 613 | S4 xwxz; \ 614 | S4 xwxw; \ 615 | S4 xwyx; \ 616 | S4 xwyy; \ 617 | S4 xwyz; \ 618 | S4 xwyw; \ 619 | S4 xwzx; \ 620 | S4 xwzy; \ 621 | S4 xwzz; \ 622 | S4 xwzw; \ 623 | S4 xwwx; \ 624 | S4 xwwy; \ 625 | S4 xwwz; \ 626 | S4 xwww; \ 627 | S4 yxxx; \ 628 | S4 yxxy; \ 629 | S4 yxxz; \ 630 | S4 yxxw; \ 631 | S4 yxyx; \ 632 | S4 yxyy; \ 633 | S4 yxyz; \ 634 | S4 yxyw; \ 635 | S4 yxzx; \ 636 | S4 yxzy; \ 637 | S4 yxzz; \ 638 | S4 yxzw; \ 639 | S4 yxwx; \ 640 | S4 yxwy; \ 641 | S4 yxwz; \ 642 | S4 yxww; \ 643 | S4 yyxx; \ 644 | S4 yyxy; \ 645 | S4 yyxz; \ 646 | S4 yyxw; \ 647 | S4 yyyx; \ 648 | S4 yyyy; \ 649 | S4 yyyz; \ 650 | S4 yyyw; \ 651 | S4 yyzx; \ 652 | S4 yyzy; \ 653 | S4 yyzz; \ 654 | S4 yyzw; \ 655 | S4 yywx; \ 656 | S4 yywy; \ 
657 | S4 yywz; \ 658 | S4 yyww; \ 659 | S4 yzxx; \ 660 | S4 yzxy; \ 661 | S4 yzxz; \ 662 | S4 yzxw; \ 663 | S4 yzyx; \ 664 | S4 yzyy; \ 665 | S4 yzyz; \ 666 | S4 yzyw; \ 667 | S4 yzzx; \ 668 | S4 yzzy; \ 669 | S4 yzzz; \ 670 | S4 yzzw; \ 671 | S4 yzwx; \ 672 | S4 yzwy; \ 673 | S4 yzwz; \ 674 | S4 yzww; \ 675 | S4 ywxx; \ 676 | S4 ywxy; \ 677 | S4 ywxz; \ 678 | S4 ywxw; \ 679 | S4 ywyx; \ 680 | S4 ywyy; \ 681 | S4 ywyz; \ 682 | S4 ywyw; \ 683 | S4 ywzx; \ 684 | S4 ywzy; \ 685 | S4 ywzz; \ 686 | S4 ywzw; \ 687 | S4 ywwx; \ 688 | S4 ywwy; \ 689 | S4 ywwz; \ 690 | S4 ywww; \ 691 | S4 zxxx; \ 692 | S4 zxxy; \ 693 | S4 zxxz; \ 694 | S4 zxxw; \ 695 | S4 zxyx; \ 696 | S4 zxyy; \ 697 | S4 zxyz; \ 698 | S4 zxyw; \ 699 | S4 zxzx; \ 700 | S4 zxzy; \ 701 | S4 zxzz; \ 702 | S4 zxzw; \ 703 | S4 zxwx; \ 704 | S4 zxwy; \ 705 | S4 zxwz; \ 706 | S4 zxww; \ 707 | S4 zyxx; \ 708 | S4 zyxy; \ 709 | S4 zyxz; \ 710 | S4 zyxw; \ 711 | S4 zyyx; \ 712 | S4 zyyy; \ 713 | S4 zyyz; \ 714 | S4 zyyw; \ 715 | S4 zyzx; \ 716 | S4 zyzy; \ 717 | S4 zyzz; \ 718 | S4 zyzw; \ 719 | S4 zywx; \ 720 | S4 zywy; \ 721 | S4 zywz; \ 722 | S4 zyww; \ 723 | S4 zzxx; \ 724 | S4 zzxy; \ 725 | S4 zzxz; \ 726 | S4 zzxw; \ 727 | S4 zzyx; \ 728 | S4 zzyy; \ 729 | S4 zzyz; \ 730 | S4 zzyw; \ 731 | S4 zzzx; \ 732 | S4 zzzy; \ 733 | S4 zzzz; \ 734 | S4 zzzw; \ 735 | S4 zzwx; \ 736 | S4 zzwy; \ 737 | S4 zzwz; \ 738 | S4 zzww; \ 739 | S4 zwxx; \ 740 | S4 zwxy; \ 741 | S4 zwxz; \ 742 | S4 zwxw; \ 743 | S4 zwyx; \ 744 | S4 zwyy; \ 745 | S4 zwyz; \ 746 | S4 zwyw; \ 747 | S4 zwzx; \ 748 | S4 zwzy; \ 749 | S4 zwzz; \ 750 | S4 zwzw; \ 751 | S4 zwwx; \ 752 | S4 zwwy; \ 753 | S4 zwwz; \ 754 | S4 zwww; \ 755 | S4 wxxx; \ 756 | S4 wxxy; \ 757 | S4 wxxz; \ 758 | S4 wxxw; \ 759 | S4 wxyx; \ 760 | S4 wxyy; \ 761 | S4 wxyz; \ 762 | S4 wxyw; \ 763 | S4 wxzx; \ 764 | S4 wxzy; \ 765 | S4 wxzz; \ 766 | S4 wxzw; \ 767 | S4 wxwx; \ 768 | S4 wxwy; \ 769 | S4 wxwz; \ 770 | S4 wxww; \ 771 | S4 wyxx; \ 772 | S4 wyxy; \ 773 | S4 wyxz; \ 774 | S4 
wyxw; \ 775 | S4 wyyx; \ 776 | S4 wyyy; \ 777 | S4 wyyz; \ 778 | S4 wyyw; \ 779 | S4 wyzx; \ 780 | S4 wyzy; \ 781 | S4 wyzz; \ 782 | S4 wyzw; \ 783 | S4 wywx; \ 784 | S4 wywy; \ 785 | S4 wywz; \ 786 | S4 wyww; \ 787 | S4 wzxx; \ 788 | S4 wzxy; \ 789 | S4 wzxz; \ 790 | S4 wzxw; \ 791 | S4 wzyx; \ 792 | S4 wzyy; \ 793 | S4 wzyz; \ 794 | S4 wzyw; \ 795 | S4 wzzx; \ 796 | S4 wzzy; \ 797 | S4 wzzz; \ 798 | S4 wzzw; \ 799 | S4 wzwx; \ 800 | S4 wzwy; \ 801 | S4 wzwz; \ 802 | S4 wzww; \ 803 | S4 wwxx; \ 804 | S4 wwxy; \ 805 | S4 wwxz; \ 806 | S4 wwxw; \ 807 | S4 wwyx; \ 808 | S4 wwyy; \ 809 | S4 wwyz; \ 810 | S4 wwyw; \ 811 | S4 wwzx; \ 812 | S4 wwzy; \ 813 | S4 wwzz; \ 814 | S4 wwzw; \ 815 | S4 wwwx; \ 816 | S4 wwwy; \ 817 | S4 wwwz; \ 818 | S4 wwww 819 | -------------------------------------------------------------------------------- /Guts/tests.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #define Test_Eps 0.00001 4 | #define Test_ConstantEps 0.001 // hack for "fmod" 5 | 6 | #define TestEqual_x2(C, T) ML_Assert(all(C((T)x1, (T)y1) == C((T)x1, (T)y1))) 7 | #define TestEqual_x3(C, T) ML_Assert(all(C((T)x1, (T)y1, (T)z1) == C((T)x1, (T)y1, (T)z1))) 8 | #define TestEqual_x4(C, T) ML_Assert(all(C((T)x1, (T)y1, (T)z1, (T)w1) == C((T)x1, (T)y1, (T)z1, (T)w1))) 9 | 10 | #define TestNotEqual_x2(C, T) ML_Assert(any(C((T)x1, (T)y1) != C((T)y1, (T)x1))) 11 | #define TestNotEqual_x3(C, T) ML_Assert(any(C((T)x1, (T)y1, (T)z1) != C((T)z1, (T)x1, (T)y1))) 12 | #define TestNotEqual_x4(C, T) ML_Assert(any(C((T)x1, (T)y1, (T)z1, (T)w1) != C((T)w1, (T)z1, (T)y1, (T)x1))) 13 | 14 | #define TestOp_x2(C, T, op) ML_Assert(all((C((T)x1, (T)y1) op C((T)x2, (T)y2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2))) 15 | #define TestOp_x3(C, T, op) ML_Assert(all((C((T)x1, (T)y1, (T)z1) op C((T)x2, (T)y2, (T)z2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2, (T)z1 op(T) z2))) 16 | #define TestOp_x4(C, T, op) ML_Assert(all((C((T)x1, (T)y1, 
(T)z1, (T)w1) op C((T)x2, (T)y2, (T)z2, (T)w2)) == C((T)x1 op(T) x2, (T)y1 op(T) y2, (T)z1 op(T) z2, (T)w1 op(T) w2))) 17 | 18 | #define Test1_x2(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1)) == C(func((T)x1), func((T)y1)))) 19 | #define Test1_x3(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1)) == C(func((T)x1), func((T)y1), func((T)z1)))) 20 | #define Test1_x4(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1, (T)w1)) == C(func((T)x1), func((T)y1), func((T)z1), func((T)w1)))) 21 | 22 | #define Test2_x2(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1), C((T)x2, (T)y2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2)))) 23 | #define Test2_x3(C, T, func) ML_Assert(all(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2)))) 24 | 25 | #define Test2_x4(C, T, func) \ 26 | ML_Assert(all(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) == C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2)))) 27 | 28 | #define Test1_x3_eps(C, T, func) \ 29 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1)) - C(func((T)x1), func((T)y1), func((T)z1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1))) * (T)Test_Eps)) 30 | 31 | #define Test1_x4_eps(C, T, func) \ 32 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1)) - C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) * (T)Test_Eps)) 33 | 34 | #define Test2_x3_eps(C, T, func) \ 35 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) <= abs(C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) * (T)Test_Eps)) 36 | #define Test2_x4_eps(C, T, func) \ 37 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))) <= abs(C(func((T)x1, (T)x2), func((T)y1, (T)y2), 
func((T)z1, (T)z2), func((T)w1, (T)w2))) * (T)Test_Eps)) 38 | 39 | #define Test1_x3_ceps(C, T, func) \ 40 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1)) - C(func((T)x1), func((T)y1), func((T)z1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1))) * (T)Test_Eps)) 41 | 42 | #define Test1_x4_ceps(C, T, func) \ 43 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1)) - C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) <= abs(C(func((T)x1), func((T)y1), func((T)z1), func((T)w1))) * (T)Test_Eps)) 44 | 45 | #define Test2_x3_ceps(C, T, func) \ 46 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1), C((T)x2, (T)y2, (T)z2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2))) <= (T)Test_ConstantEps)) 47 | #define Test2_x4_ceps(C, T, func) \ 48 | ML_Assert(all(abs(func(C((T)x1, (T)y1, (T)z1, (T)w1), C((T)x2, (T)y2, (T)z2, (T)w2)) - C(func((T)x1, (T)x2), func((T)y1, (T)y2), func((T)z1, (T)z2), func((T)w1, (T)w2))) <= (T)Test_ConstantEps)) 49 | 50 | #include "../ml.hlsli" 51 | 52 | #ifdef ML_NAMESPACE 53 | namespace ml { 54 | #endif 55 | 56 | void ML_Tests() { 57 | const uint32_t N = 10000; 58 | const float R = 10000.0f; 59 | 60 | uint32_t rngState = 1983; 61 | Rng::Hash::Initialize(rngState, 0, 0); 62 | 63 | for (uint32_t i = 0; i < N; i++) { 64 | { // Ops 65 | float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 66 | float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 67 | float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 68 | float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 69 | 70 | float x2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 71 | float y2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 72 | float z2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 73 | float w2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 74 | 75 | TestOp_x2(float2, float, -); 76 | TestOp_x2(float2, float, +); 77 | TestOp_x2(float2, float, *); 78 | TestOp_x2(float2, float, /); 79 | 80 | TestOp_x3(float3, float, -); 81 | TestOp_x3(float3, float, +); 82 | 
TestOp_x3(float3, float, *); 83 | TestOp_x3(float3, float, /); 84 | 85 | TestOp_x4(float4, float, -); 86 | TestOp_x4(float4, float, +); 87 | TestOp_x4(float4, float, *); 88 | TestOp_x4(float4, float, /); 89 | 90 | TestOp_x2(double2, double, -); 91 | TestOp_x2(double2, double, +); 92 | TestOp_x2(double2, double, *); 93 | TestOp_x2(double2, double, /); 94 | 95 | TestOp_x3(double3, double, -); 96 | TestOp_x3(double3, double, +); 97 | TestOp_x3(double3, double, *); 98 | TestOp_x3(double3, double, /); 99 | 100 | TestOp_x4(double4, double, +); 101 | TestOp_x4(double4, double, -); 102 | TestOp_x4(double4, double, *); 103 | TestOp_x4(double4, double, /); 104 | 105 | // Avoid division by "0" for integers 106 | if (x2 > -1 && x2 < 1) 107 | x2 = 1; 108 | 109 | if (y2 > -1 && y2 < 1) 110 | y2 = 1; 111 | 112 | if (z2 > -1 && z2 < 1) 113 | z2 = 1; 114 | 115 | if (w2 > -1 && w2 < 1) 116 | w2 = 1; 117 | 118 | TestOp_x2(int2, int32_t, -); 119 | TestOp_x2(int2, int32_t, +); 120 | TestOp_x2(int2, int32_t, *); 121 | TestOp_x2(int2, int32_t, /); 122 | 123 | TestOp_x3(int3, int32_t, -); 124 | TestOp_x3(int3, int32_t, +); 125 | TestOp_x3(int3, int32_t, *); 126 | TestOp_x3(int3, int32_t, /); 127 | 128 | TestOp_x4(int4, int32_t, -); 129 | TestOp_x4(int4, int32_t, +); 130 | TestOp_x4(int4, int32_t, *); 131 | TestOp_x4(int4, int32_t, /); 132 | 133 | TestOp_x2(uint2, uint32_t, -); 134 | TestOp_x2(uint2, uint32_t, +); 135 | TestOp_x2(uint2, uint32_t, *); 136 | TestOp_x2(uint2, uint32_t, /); 137 | 138 | TestOp_x3(uint3, uint32_t, -); 139 | TestOp_x3(uint3, uint32_t, +); 140 | TestOp_x3(uint3, uint32_t, *); 141 | TestOp_x3(uint3, uint32_t, /); 142 | 143 | TestOp_x4(uint4, uint32_t, -); 144 | TestOp_x4(uint4, uint32_t, +); 145 | TestOp_x4(uint4, uint32_t, *); 146 | TestOp_x4(uint4, uint32_t, /); 147 | } 148 | 149 | { // Integer ops 150 | uint32_t x1 = Rng::Hash::GetUint(rngState); 151 | uint32_t y1 = Rng::Hash::GetUint(rngState); 152 | uint32_t z1 = Rng::Hash::GetUint(rngState); 153 | uint32_t w1 
= Rng::Hash::GetUint(rngState); 154 | 155 | uint32_t x2 = Rng::Hash::GetUint(rngState); 156 | uint32_t y2 = Rng::Hash::GetUint(rngState); 157 | uint32_t z2 = Rng::Hash::GetUint(rngState); 158 | uint32_t w2 = Rng::Hash::GetUint(rngState); 159 | 160 | TestOp_x2(int2, int32_t, &); 161 | TestOp_x2(int2, int32_t, |); 162 | TestOp_x2(int2, int32_t, ^); 163 | 164 | TestOp_x3(int3, int32_t, &); 165 | TestOp_x3(int3, int32_t, |); 166 | TestOp_x3(int3, int32_t, ^); 167 | 168 | TestOp_x4(int4, int32_t, &); 169 | TestOp_x4(int4, int32_t, |); 170 | TestOp_x4(int4, int32_t, ^); 171 | 172 | TestOp_x2(uint2, uint32_t, &); 173 | TestOp_x2(uint2, uint32_t, |); 174 | TestOp_x2(uint2, uint32_t, ^); 175 | 176 | TestOp_x3(uint3, uint32_t, &); 177 | TestOp_x3(uint3, uint32_t, |); 178 | TestOp_x3(uint3, uint32_t, ^); 179 | 180 | TestOp_x4(uint4, uint32_t, &); 181 | TestOp_x4(uint4, uint32_t, |); 182 | TestOp_x4(uint4, uint32_t, ^); 183 | 184 | // Shifts and mod: use sane 2nd arg 185 | x1 &= 0x7FFFFFFF; 186 | y1 &= 0x7FFFFFFF; 187 | z1 &= 0x7FFFFFFF; 188 | w1 &= 0x7FFFFFFF; 189 | 190 | x2 &= 31; 191 | y2 &= 31; 192 | z2 &= 31; 193 | w2 &= 31; 194 | 195 | TestOp_x2(int2, int32_t, <<); 196 | TestOp_x2(int2, int32_t, >>); 197 | 198 | TestOp_x3(int3, int32_t, <<); 199 | TestOp_x3(int3, int32_t, >>); 200 | 201 | TestOp_x4(int4, int32_t, <<); 202 | TestOp_x4(int4, int32_t, >>); 203 | 204 | TestOp_x2(uint2, uint32_t, <<); 205 | TestOp_x2(uint2, uint32_t, >>); 206 | 207 | TestOp_x3(uint3, uint32_t, <<); 208 | TestOp_x3(uint3, uint32_t, >>); 209 | 210 | TestOp_x4(uint4, uint32_t, <<); 211 | TestOp_x4(uint4, uint32_t, >>); 212 | 213 | // Avoid division by "0" 214 | if (!x2) 215 | x2 = 1; 216 | 217 | if (!y2) 218 | y2 = 1; 219 | 220 | if (!z2) 221 | z2 = 1; 222 | 223 | if (!w2) 224 | w2 = 1; 225 | 226 | TestOp_x2(int2, int32_t, %); 227 | TestOp_x3(int3, int32_t, %); 228 | TestOp_x4(int4, int32_t, %); 229 | TestOp_x2(uint2, uint32_t, %); 230 | TestOp_x3(uint3, uint32_t, %); 231 | TestOp_x4(uint4, 
uint32_t, %); 232 | } 233 | 234 | { // Math [-INF, INF] 235 | float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 236 | float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 237 | float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 238 | float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 239 | 240 | float x2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 241 | float y2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 242 | float z2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 243 | float w2 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 244 | 245 | Test1_x2(float2, float, degrees); 246 | Test1_x2(float2, float, radians); 247 | Test1_x2(float2, float, sign); 248 | Test1_x2(float2, float, abs); 249 | Test1_x2(float2, float, floor); 250 | Test1_x2(float2, float, ceil); 251 | Test1_x2(float2, float, frac); 252 | Test1_x2(float2, float, saturate); 253 | Test2_x2(float2, float, min); 254 | Test2_x2(float2, float, max); 255 | Test2_x2(float2, float, step); 256 | 257 | Test1_x2(float2, float, rcp); 258 | Test1_x2(float2, float, sin); 259 | Test1_x2(float2, float, cos); 260 | Test1_x2(float2, float, tan); 261 | Test1_x2(float2, float, atan); 262 | Test2_x2(float2, float, fmod); 263 | Test2_x2(float2, float, atan2); 264 | 265 | Test1_x2(double2, double, degrees); 266 | Test1_x2(double2, double, radians); 267 | Test1_x2(double2, double, sign); 268 | Test1_x2(double2, double, abs); 269 | Test1_x2(double2, double, floor); 270 | Test1_x2(double2, double, ceil); 271 | Test1_x2(double2, double, frac); 272 | Test1_x2(double2, double, saturate); 273 | Test2_x2(double2, double, min); 274 | Test2_x2(double2, double, max); 275 | Test2_x2(double2, double, step); 276 | 277 | Test1_x2(double2, double, rcp); 278 | Test1_x2(double2, double, sin); 279 | Test1_x2(double2, double, cos); 280 | Test1_x2(double2, double, tan); 281 | Test1_x2(double2, double, atan); 282 | Test2_x2(double2, double, fmod); 283 | Test2_x2(double2, double, atan2); 284 | 285 | Test1_x3(float3, float, degrees); 
286 | Test1_x3(float3, float, radians); 287 | Test1_x3(float3, float, sign); 288 | Test1_x3(float3, float, abs); 289 | Test1_x3(float3, float, floor); 290 | Test1_x3(float3, float, ceil); 291 | Test1_x3(float3, float, frac); 292 | Test1_x3(float3, float, saturate); 293 | Test2_x3(float3, float, min); 294 | Test2_x3(float3, float, max); 295 | Test2_x3(float3, float, step); 296 | 297 | Test1_x3_eps(float3, float, rcp); 298 | Test1_x3_eps(float3, float, sin); 299 | Test1_x3_eps(float3, float, cos); 300 | Test1_x3_eps(float3, float, tan); 301 | Test1_x3_eps(float3, float, atan); 302 | Test2_x3_ceps(float3, float, fmod); 303 | Test2_x3_eps(float3, float, atan2); 304 | 305 | Test1_x3(double3, double, degrees); 306 | Test1_x3(double3, double, radians); 307 | Test1_x3(double3, double, sign); 308 | Test1_x3(double3, double, abs); 309 | Test1_x3(double3, double, floor); 310 | Test1_x3(double3, double, ceil); 311 | Test1_x3(double3, double, frac); 312 | Test1_x3(double3, double, saturate); 313 | Test2_x3(double3, double, min); 314 | Test2_x3(double3, double, max); 315 | Test2_x3(double3, double, step); 316 | 317 | Test1_x3_eps(double3, double, rcp); 318 | Test1_x3_eps(double3, double, sin); 319 | Test1_x3_eps(double3, double, cos); 320 | Test1_x3_eps(double3, double, tan); 321 | Test1_x3_eps(double3, double, atan); 322 | Test2_x3_ceps(double3, double, fmod); 323 | Test2_x3_eps(double3, double, atan2); 324 | 325 | Test1_x4(float4, float, degrees); 326 | Test1_x4(float4, float, radians); 327 | Test1_x4(float4, float, sign); 328 | Test1_x4(float4, float, abs); 329 | Test1_x4(float4, float, floor); 330 | Test1_x4(float4, float, ceil); 331 | Test1_x4(float4, float, frac); 332 | Test1_x4(float4, float, saturate); 333 | Test2_x4(float4, float, min); 334 | Test2_x4(float4, float, max); 335 | Test2_x4(float4, float, step); 336 | 337 | Test1_x4_eps(float4, float, rcp); 338 | Test1_x4_eps(float4, float, sin); 339 | Test1_x4_eps(float4, float, cos); 340 | Test1_x4_eps(float4, float, 
tan); 341 | Test1_x4_eps(float4, float, atan); 342 | Test2_x4_ceps(float4, float, fmod); 343 | Test2_x4_eps(float4, float, atan2); 344 | 345 | Test1_x4(double4, double, degrees); 346 | Test1_x4(double4, double, radians); 347 | Test1_x4(double4, double, sign); 348 | Test1_x4(double4, double, abs); 349 | Test1_x4(double4, double, floor); 350 | Test1_x4(double4, double, ceil); 351 | Test1_x4(double4, double, frac); 352 | Test1_x4(double4, double, saturate); 353 | Test2_x4(double4, double, min); 354 | Test2_x4(double4, double, max); 355 | Test2_x4(double4, double, step); 356 | 357 | Test1_x4_eps(double4, double, rcp); 358 | Test1_x4_eps(double4, double, sin); 359 | Test1_x4_eps(double4, double, cos); 360 | Test1_x4_eps(double4, double, tan); 361 | Test1_x4_eps(double4, double, atan); 362 | Test2_x4_ceps(double4, double, fmod); 363 | Test2_x4_eps(double4, double, atan2); 364 | 365 | // round: avoid fractional part = 0.5 366 | if (frac(x1) == 0.5f) 367 | x1 = uFloat(uFloat(x1).i + 1).f; 368 | 369 | if (frac(y1) == 0.5f) 370 | y1 = uFloat(uFloat(y1).i + 1).f; 371 | 372 | if (frac(z1) == 0.5f) 373 | z1 = uFloat(uFloat(z1).i + 1).f; 374 | 375 | if (frac(w1) == 0.5f) 376 | w1 = uFloat(uFloat(w1).i + 1).f; 377 | 378 | Test1_x2(float2, float, round); 379 | Test1_x3(float3, float, round); 380 | Test1_x4(float4, float, round); 381 | 382 | Test1_x2(double2, double, round); 383 | Test1_x3(double3, double, round); 384 | Test1_x4(double4, double, round); 385 | 386 | // pow/exp/exp2: do not use to large "x" and "y" to avoid "INF" 387 | x1 *= 32.0f / R; 388 | y1 *= 32.0f / R; 389 | z1 *= 32.0f / R; 390 | w1 *= 32.0f / R; 391 | 392 | x2 *= 32.0f / R; 393 | y2 *= 32.0f / R; 394 | z2 *= 32.0f / R; 395 | w2 *= 32.0f / R; 396 | 397 | Test1_x2(float2, float, exp); 398 | Test1_x3_eps(float3, float, exp); 399 | Test1_x4_eps(float4, float, exp); 400 | 401 | Test1_x2(double2, double, exp); 402 | Test1_x3_eps(double3, double, exp); 403 | Test1_x4_eps(double4, double, exp); 404 | 405 | 
Test1_x2(float2, float, exp2); 406 | Test1_x3_eps(float3, float, exp2); 407 | Test1_x4_eps(float4, float, exp2); 408 | 409 | Test1_x2(double2, double, exp2); 410 | Test1_x3_eps(double3, double, exp2); 411 | Test1_x4_eps(double4, double, exp2); 412 | 413 | // pow: "x" must be positive 414 | x1 = abs(x1); 415 | y1 = abs(y1); 416 | z1 = abs(z1); 417 | w1 = abs(w1); 418 | 419 | Test2_x2(float2, float, pow); 420 | Test2_x3_eps(float3, float, pow); 421 | Test2_x4_eps(float4, float, pow); 422 | 423 | Test2_x2(double2, double, pow); 424 | Test2_x3_eps(double3, double, pow); 425 | Test2_x4_eps(double4, double, pow); 426 | } 427 | 428 | { // Math (> 0) 429 | float x1 = Rng::Hash::GetFloat(rngState) * R; 430 | float y1 = Rng::Hash::GetFloat(rngState) * R; 431 | float z1 = Rng::Hash::GetFloat(rngState) * R; 432 | float w1 = Rng::Hash::GetFloat(rngState) * R; 433 | 434 | Test1_x2(float2, float, rsqrt); 435 | Test1_x2(float2, float, sqrt); 436 | Test1_x2(float2, float, log); 437 | Test1_x2(float2, float, log2); 438 | 439 | Test1_x2(double2, double, rsqrt); 440 | Test1_x2(double2, double, sqrt); 441 | Test1_x2(double2, double, log); 442 | Test1_x2(double2, double, log2); 443 | 444 | Test1_x3_eps(float3, float, rsqrt); 445 | Test1_x3_eps(float3, float, sqrt); 446 | Test1_x3_eps(float3, float, log); 447 | Test1_x3_eps(float3, float, log2); 448 | 449 | Test1_x3_eps(double3, double, rsqrt); 450 | Test1_x3_eps(double3, double, sqrt); 451 | Test1_x3_eps(double3, double, log); 452 | Test1_x3_eps(double3, double, log2); 453 | 454 | Test1_x4_eps(float4, float, rsqrt); 455 | Test1_x4_eps(float4, float, sqrt); 456 | Test1_x4_eps(float4, float, log); 457 | Test1_x4_eps(float4, float, log2); 458 | 459 | Test1_x4_eps(double4, double, rsqrt); 460 | Test1_x4_eps(double4, double, sqrt); 461 | Test1_x4_eps(double4, double, log); 462 | Test1_x4_eps(double4, double, log2); 463 | } 464 | 465 | { // Math [-1; 1] 466 | float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f; 467 | float y1 = 
(Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f; 468 | float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f; 469 | float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * 2.0f; 470 | 471 | Test1_x2(float2, float, asin); 472 | Test1_x2(float2, float, acos); 473 | 474 | Test1_x2(double2, double, asin); 475 | Test1_x2(double2, double, acos); 476 | 477 | Test1_x3_eps(float3, float, asin); 478 | Test1_x3_eps(float3, float, acos); 479 | 480 | Test1_x3_eps(double3, double, asin); 481 | Test1_x3_eps(double3, double, acos); 482 | 483 | Test1_x4_eps(float4, float, asin); 484 | Test1_x4_eps(float4, float, acos); 485 | 486 | Test1_x4_eps(double4, double, asin); 487 | Test1_x4_eps(double4, double, acos); 488 | } 489 | } 490 | 491 | { // == and != 492 | float x1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 493 | float y1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 494 | float z1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 495 | float w1 = (Rng::Hash::GetFloat(rngState) - 0.5f) * R; 496 | 497 | TestEqual_x2(int2, int32_t); 498 | TestEqual_x3(int3, int32_t); 499 | TestEqual_x4(int4, int32_t); 500 | 501 | TestEqual_x2(uint2, uint32_t); 502 | TestEqual_x3(uint3, uint32_t); 503 | TestEqual_x4(uint4, uint32_t); 504 | 505 | TestEqual_x2(float2, float); 506 | TestEqual_x3(float3, float); 507 | TestEqual_x4(float4, float); 508 | 509 | TestEqual_x2(double2, double); 510 | TestEqual_x3(double3, double); 511 | TestEqual_x4(double4, double); 512 | 513 | TestNotEqual_x2(int2, int32_t); 514 | TestNotEqual_x3(int3, int32_t); 515 | TestNotEqual_x4(int4, int32_t); 516 | 517 | TestNotEqual_x2(uint2, uint32_t); 518 | TestNotEqual_x3(uint3, uint32_t); 519 | TestNotEqual_x4(uint4, uint32_t); 520 | 521 | TestNotEqual_x2(float2, float); 522 | TestNotEqual_x3(float3, float); 523 | TestNotEqual_x4(float4, float); 524 | 525 | TestNotEqual_x2(double2, double); 526 | TestNotEqual_x3(double3, double); 527 | TestNotEqual_x4(double4, double); 528 | } 529 | 530 | { // +0 == -0 531 | ML_Assert(all(+0 == -0)); 
532 | ML_Assert(all(int2(+0) == int2(-0))); 533 | ML_Assert(all(int3(+0) == int3(-0))); 534 | ML_Assert(all(int4(+0) == int4(-0))); 535 | 536 | ML_Assert(all(+0.0f == -0.0f)); 537 | ML_Assert(all(float2(+0.0f) == float2(-0.0f))); 538 | ML_Assert(all(float3(+0.0f) == float3(-0.0f))); 539 | ML_Assert(all(float4(+0.0f) == float4(-0.0f))); 540 | // Double precision: literals without the "f" suffix, checked on the double vector types 541 | ML_Assert(all(+0.0 == -0.0)); 542 | ML_Assert(all(double2(+0.0) == double2(-0.0))); 543 | ML_Assert(all(double3(+0.0) == double3(-0.0))); 544 | ML_Assert(all(double4(+0.0) == double4(-0.0))); 545 | } 546 | 547 | { // NAN != NAN 548 | float myNanf = log(0.0f) * 0.0f; 549 | ML_Assert(myNanf != myNanf); 550 | ML_Assert(any(float2(myNanf) != float2(myNanf))); 551 | ML_Assert(any(float3(myNanf) != float3(myNanf))); 552 | ML_Assert(any(float4(myNanf) != float4(myNanf))); 553 | 554 | double myNan = log(0.0) * 0.0; 555 | ML_Assert(myNan != myNan); 556 | ML_Assert(any(double2(myNan) != double2(myNan))); 557 | ML_Assert(any(double3(myNan) != double3(myNan))); 558 | ML_Assert(any(double4(myNan) != double4(myNan))); 559 | } 560 | } 561 | 562 | #ifdef ML_NAMESPACE 563 | } 564 | #endif 565 | -------------------------------------------------------------------------------- /Guts/u32.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | #pragma once 4 | 5 | //====================================================================================================================== 6 | // uint2 7 | //====================================================================================================================== 8 | 9 | union uint2 { 10 | v2i mm; 11 | 12 | struct { 13 | uint32_t a[COORD_2D]; 14 | }; 15 | 16 | struct { 17 | uint32_t x, y; 18 | }; 19 | 20 | ML_SWIZZLE_2(uint2, uint32_t); 21 | 22 | public: 23 | ML_INLINE uint2() 24 | : mm(0) { 25 | } 26 | 27 | ML_INLINE uint2(uint32_t c) 28 | : x(c), y(c) { 29 | } 30 | 31 | ML_INLINE uint2(uint32_t _x, uint32_t _y) 32 | : x(_x), 
y(_y) { 33 | } 34 | 35 | ML_INLINE uint2(const uint2& v) = default; 36 | 37 | // Set 38 | 39 | ML_INLINE void operator=(const uint2& v) { 40 | mm = v.mm; 41 | } 42 | 43 | // Conversion 44 | 45 | ML_INLINE operator int2() const; 46 | ML_INLINE operator float2() const; 47 | ML_INLINE operator double2() const; 48 | 49 | // Compare 50 | 51 | ML_COMPARE_UNOPT(bool2, uint2, <) 52 | ML_COMPARE_UNOPT(bool2, uint2, <=) 53 | ML_COMPARE_UNOPT(bool2, uint2, ==) 54 | ML_COMPARE_UNOPT(bool2, uint2, >=) 55 | ML_COMPARE_UNOPT(bool2, uint2, >) 56 | ML_COMPARE_UNOPT(bool2, uint2, !=) 57 | 58 | // Ops 59 | 60 | ML_OP_UNOPT(uint2, uint32_t, -, -=) 61 | ML_OP_UNOPT(uint2, uint32_t, +, +=) 62 | ML_OP_UNOPT(uint2, uint32_t, *, *=) 63 | ML_OP_UNOPT(uint2, uint32_t, /, /=) 64 | ML_OP_UNOPT(uint2, uint32_t, %, %=) 65 | ML_OP_UNOPT(uint2, uint32_t, <<, <<=) 66 | ML_OP_UNOPT(uint2, uint32_t, >>, >>=) 67 | ML_OP_UNOPT(uint2, uint32_t, &, &=) 68 | ML_OP_UNOPT(uint2, uint32_t, |, |=) 69 | ML_OP_UNOPT(uint2, uint32_t, ^, ^=) 70 | }; 71 | 72 | ML_INLINE uint2 min(const uint2& x, const uint2& y) { 73 | return uint2(min(x.x, y.x), min(x.y, y.y)); 74 | } 75 | 76 | ML_INLINE uint2 max(const uint2& x, const uint2& y) { 77 | return uint2(max(x.x, y.x), max(x.y, y.y)); 78 | } 79 | 80 | //====================================================================================================================== 81 | // uint3 82 | //====================================================================================================================== 83 | 84 | union uint3 { 85 | v4i xmm; 86 | 87 | struct { 88 | uint32_t a[COORD_3D]; 89 | }; 90 | 91 | struct { 92 | uint32_t x, y, z; 93 | }; 94 | 95 | ML_SWIZZLE_3(v4u_swizzle2, uint2, v4u_swizzle3, uint3); 96 | 97 | public: 98 | ML_INLINE uint3() 99 | : xmm(_mm_setzero_si128()) { 100 | } 101 | 102 | ML_INLINE uint3(uint32_t c) 103 | : xmm(_mm_set1_epi32(c)) { 104 | } 105 | 106 | ML_INLINE uint3(uint32_t _x, uint32_t _y, uint32_t _z) 107 | : xmm(v4i_set(_x, _y, _z, 
1)) { 108 | } 109 | 110 | ML_INLINE uint3(const uint2& v, uint32_t _z) 111 | : xmm(v4i_set(v.x, v.y, _z, 1)) { 112 | } 113 | 114 | ML_INLINE uint3(uint32_t _x, const uint2& v) 115 | : xmm(v4i_set(_x, v.x, v.y, 1)) { 116 | } 117 | 118 | ML_INLINE uint3(const v4i& v) 119 | : xmm(v) { 120 | } 121 | 122 | ML_INLINE uint3(const uint32_t* v3) 123 | : xmm(v4i_set(v3[0], v3[1], v3[2], 1)) { 124 | } 125 | 126 | ML_INLINE uint3(const uint3& v) = default; 127 | 128 | // Set 129 | 130 | ML_INLINE void operator=(const uint3& v) { 131 | xmm = v.xmm; 132 | } 133 | 134 | // Conversion 135 | 136 | ML_INLINE operator int3() const; 137 | ML_INLINE operator float3() const; 138 | ML_INLINE operator double3() const; 139 | 140 | // Compare 141 | // NOTE(review): the "_epi32" compare intrinsics named below are presumably signed 32-bit compares, while min/max for this type use the unsigned _mm_min_epu32 / _mm_max_epu32 - confirm the ordering of lanes >= 0x80000000 142 | ML_COMPARE(bool3, uint3, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm) 143 | ML_COMPARE(bool3, uint3, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm) 144 | ML_COMPARE(bool3, uint3, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm) 145 | ML_COMPARE(bool3, uint3, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm) 146 | ML_COMPARE(bool3, uint3, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm) 147 | ML_COMPARE(bool3, uint3, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm) 148 | 149 | // Ops 150 | 151 | ML_OP(uint3, uint32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm) 152 | ML_OP(uint3, uint32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm) 153 | ML_OP(uint3, uint32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm) 154 | ML_OP(uint3, uint32_t, /, /=, _mm_div_epu32, _mm_set1_epi32, xmm) 155 | ML_OP(uint3, uint32_t, %, %=, v4u_mod, _mm_set1_epi32, xmm) 156 | ML_OP(uint3, uint32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm) 157 | ML_OP(uint3, uint32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm) 158 | ML_OP(uint3, uint32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm) 159 | ML_OP(uint3, uint32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm) 160 | ML_OP(uint3, uint32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm) 161 | 162 | // Misc 163 | 164 | ML_INLINE operator v4i() 
const { 165 | return xmm; 166 | } 167 | 168 | static ML_INLINE uint3 Zero() { 169 | return _mm_setzero_si128(); 170 | } 171 | }; 172 | 173 | ML_INLINE uint3 min(const uint3& x, const uint3& y) { 174 | return _mm_min_epu32(x.xmm, y.xmm); 175 | } 176 | 177 | ML_INLINE uint3 max(const uint3& x, const uint3& y) { 178 | return _mm_max_epu32(x.xmm, y.xmm); 179 | } 180 | 181 | //====================================================================================================================== 182 | // uint4 183 | //====================================================================================================================== 184 | 185 | union uint4 { 186 | v4i xmm; 187 | 188 | struct { 189 | uint32_t a[COORD_4D]; 190 | }; 191 | 192 | struct { 193 | uint32_t x, y, z, w; 194 | }; 195 | 196 | ML_SWIZZLE_4(v4u_swizzle2, uint2, v4u_swizzle3, uint3, v4u_swizzle4, uint4); 197 | 198 | public: 199 | ML_INLINE uint4() 200 | : xmm(_mm_setzero_si128()) { 201 | } 202 | 203 | ML_INLINE uint4(uint32_t c) 204 | : xmm(_mm_set1_epi32(c)) { 205 | } 206 | 207 | ML_INLINE uint4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) 208 | : xmm(v4i_set(_x, _y, _z, _w)) { 209 | } 210 | 211 | ML_INLINE uint4(const uint3& v, uint32_t _w) 212 | : xmm(v4i_set(v.x, v.y, v.z, _w)) { 213 | } 214 | 215 | ML_INLINE uint4(const uint2& a, const uint2& b) 216 | : xmm(v4i_set(a.x, a.y, b.x, b.y)) { 217 | } 218 | 219 | ML_INLINE uint4(uint32_t _x, const uint3& v) 220 | : xmm(v4i_set(_x, v.x, v.y, v.z)) { 221 | } 222 | 223 | ML_INLINE uint4(const v4i& v) 224 | : xmm(v) { 225 | } 226 | 227 | ML_INLINE uint4(const uint4& v) = default; 228 | 229 | // Set 230 | 231 | ML_INLINE void operator=(const uint4& v) { 232 | xmm = v.xmm; 233 | } 234 | 235 | // Conversion 236 | 237 | ML_INLINE operator int4() const; 238 | ML_INLINE operator float4() const; 239 | ML_INLINE operator double4() const; 240 | 241 | // Compare 242 | 243 | ML_COMPARE(bool4, uint4, <, _mm_cmplt_epi32, _mm_movemask_epi32, xmm) 244 | 
ML_COMPARE(bool4, uint4, <=, _mm_cmple_epi32, _mm_movemask_epi32, xmm) 245 | ML_COMPARE(bool4, uint4, ==, _mm_cmpeq_epi32, _mm_movemask_epi32, xmm) 246 | ML_COMPARE(bool4, uint4, >, _mm_cmpgt_epi32, _mm_movemask_epi32, xmm) 247 | ML_COMPARE(bool4, uint4, >=, _mm_cmpge_epi32, _mm_movemask_epi32, xmm) 248 | ML_COMPARE(bool4, uint4, !=, _mm_cmpneq_epi32, _mm_movemask_epi32, xmm) 249 | 250 | // Ops 251 | 252 | ML_OP(uint4, uint32_t, -, -=, _mm_sub_epi32, _mm_set1_epi32, xmm) 253 | ML_OP(uint4, uint32_t, +, +=, _mm_add_epi32, _mm_set1_epi32, xmm) 254 | ML_OP(uint4, uint32_t, *, *=, _mm_mullo_epi32, _mm_set1_epi32, xmm) 255 | ML_OP(uint4, uint32_t, /, /=, _mm_div_epu32, _mm_set1_epi32, xmm) 256 | ML_OP(uint4, uint32_t, %, %=, v4u_mod, _mm_set1_epi32, xmm) 257 | ML_OP(uint4, uint32_t, <<, <<=, _mm_sllv_epi32, _mm_set1_epi32, xmm) 258 | ML_OP(uint4, uint32_t, >>, >>=, _mm_srlv_epi32, _mm_set1_epi32, xmm) 259 | ML_OP(uint4, uint32_t, &, &=, _mm_and_si128, _mm_set1_epi32, xmm) 260 | ML_OP(uint4, uint32_t, |, |=, _mm_or_si128, _mm_set1_epi32, xmm) 261 | ML_OP(uint4, uint32_t, ^, ^=, _mm_xor_si128, _mm_set1_epi32, xmm) 262 | 263 | // Misc 264 | 265 | ML_INLINE operator v4i() const { 266 | return xmm; 267 | } 268 | 269 | static ML_INLINE uint4 Zero() { 270 | return _mm_setzero_si128(); 271 | } 272 | }; 273 | 274 | ML_INLINE uint4 min(const uint4& x, const uint4& y) { 275 | return _mm_min_epu32(x.xmm, y.xmm); 276 | } 277 | 278 | ML_INLINE uint4 max(const uint4& x, const uint4& y) { 279 | return _mm_max_epu32(x.xmm, y.xmm); 280 | } 281 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the 8 | Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 16 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MathLib (ML) 2 | 3 | *ML* is a cross-platform header-only *SSE/AVX/NEON*-accelerated math library, designed for computer graphics. 
It serves two goals: 4 | - improve performance using *SSE/AVX/NEON* intrinsics 5 | - be HLSL compatible and deliver functionality to both CPU and shader code without code duplication 6 | 7 | Features: 8 | - compile-time optimization level specialization: SSE3 (and below), +SSE4, +AVX1, +AVX2 (or NEON on ARM via [*sse2neon*](https://github.com/DLTcollab/sse2neon)) 9 | - `int2`, `int3` and `int4` types 10 | - `uint2`, `uint3` and `uint4` types 11 | - `float2`, `float3`, `float4` and `float4x4` types 12 | - `double2`, `double3`, `double4` and `double4x4` types 13 | - `bool2`, `bool3` and `bool4` types 14 | - overloaded operators 15 | - vector swizzling 16 | - common functions: `all`, `any`, `sign`, `abs`, `floor`, `round`, `ceil`, `fmod`, `frac`, `min`, `max`, `clamp`, `saturate`, `lerp`, `step`, `smoothstep` and `linearstep` 17 | - transcendental functions: `sin`, `cos`, `tan`, `asin`, `acos`, `atan`, `atan2`, `sqrt`, `rsqrt`, `rcp`, `pow`, `log`, `log2`, `exp` and `exp2` 18 | - data conversion and packing functionality: FP32, FP16, SNORM and UNORM (with any number of bits per component) 19 | - vectors and matrices 20 | - miscellaneous linear algebra functionality 21 | - miscellaneous projective math functionality 22 | - frustum & AABB primitives 23 | - random number generation 24 | - sorting 25 | 26 | Important: 27 | - `sizeof(int3/uint3/float3) == sizeof(float4)` on CPU 28 | - `sizeof(double3) == sizeof(double4)` on CPU 29 | - `using namespace std` can lead to name collisions 30 | - inclusion of `cmath` and/or `cstdlib` (even implicitly) after `ml.h` leads to name collisions 31 | 32 | Also includes the `ml.hlsli` file, which is a standalone HLSL math library usable in C++ code. 33 | 34 | ## License 35 | 36 | *ML* is licensed under the MIT License. 
37 | -------------------------------------------------------------------------------- /ml.h: -------------------------------------------------------------------------------- 1 | // © 2021 NVIDIA Corporation 2 | 3 | /* 4 | IMPORTANT: 5 | - intrinsic related headers must not be included *AFTER* ML inclusion 6 | - "ML_NAMESPACE" macro can be defined to wrap the entire ML into "ml" namespace 7 | - sizeof(3-component vector) == sizeof(4-component vector) because of SSE 8 | */ 9 | 10 | #pragma once 11 | 12 | #define ML_VERSION 9 13 | #define ML_VERSION_DATE "2 October 2025" 14 | 15 | //====================================================================================================================== 16 | // Constants 17 | //====================================================================================================================== 18 | 19 | // Intrinsic levels (everything above "ML_INTRINSIC_LEVEL" is emulated) 20 | #define ML_INTRINSIC_SSE3 0 // +SSE1, +SSE2, +SSE3, +SSSE3 ("-mssse3" in GCC/Clang) 21 | #define ML_INTRINSIC_SSE4 1 // +SSE4.1, +SSE4.2 ("-msse4.2" in GCC/Clang) 22 | #define ML_INTRINSIC_AVX1 2 // +AVX1, +FP16C ("-mf16c" in GCC/Clang) 23 | #define ML_INTRINSIC_AVX2 3 // +AVX2, +FMA3, +bit shift, +swizzle ("-mavx2 -mfma" in GCC/Clang) 24 | 25 | //====================================================================================================================== 26 | // Settings 27 | //====================================================================================================================== 28 | 29 | // Can be set to wrap the library into "ml" namespace 30 | #ifndef ML_NAMESPACE 31 | // #define ML_NAMESPACE 32 | #endif 33 | 34 | // Selected intrinsic level (try to guess) 35 | #ifndef ML_INTRINSIC_LEVEL 36 | # if (defined(__AVX2__) && defined(__FMA__)) 37 | # define ML_INTRINSIC_LEVEL ML_INTRINSIC_AVX2 38 | # elif defined(__F16C__) 39 | # define ML_INTRINSIC_LEVEL ML_INTRINSIC_AVX1 40 | # elif defined(__SSE4_2__) 41 | # define 
ML_INTRINSIC_LEVEL ML_INTRINSIC_SSE4 42 | # else 43 | # define ML_INTRINSIC_LEVEL ML_INTRINSIC_SSE3 44 | # endif 45 | #endif 46 | 47 | // ARM? 48 | #if (defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM)) 49 | # define ML_ARM 50 | #endif 51 | 52 | // SVML availability 53 | #ifndef ML_SVML_AVAILABLE 54 | # ifdef ML_ARM 55 | # define ML_SVML_AVAILABLE 0 56 | # else 57 | # define ML_SVML_AVAILABLE (_MSC_VER >= 1920 && __clang__ == 0) 58 | # endif 59 | #endif 60 | 61 | // More precision (a little bit slower) 62 | #ifndef ML_NEWTONRAPHSON_APROXIMATION 63 | # define ML_NEWTONRAPHSON_APROXIMATION 1 64 | #endif 65 | 66 | // Only for debugging (useful to debug issues in horizontal operations) 67 | #ifndef ML_CHECK_W_IS_ZERO 68 | # define ML_CHECK_W_IS_ZERO 0 69 | #endif 70 | 71 | // Only for debugging (generate exceptions in rounding operations, only for SSE4) 72 | #ifndef ML_EXEPTIONS 73 | # define ML_EXEPTIONS 0 74 | #endif 75 | 76 | // Reversed depth 77 | #ifndef ML_DEPTH_REVERSED 78 | # define ML_DEPTH_REVERSED 1 79 | #endif 80 | 81 | // Can be handy for classic OpenGL 82 | #ifndef ML_OGL 83 | # define ML_OGL 0 84 | #endif 85 | 86 | // Depth range (each bound can be overridden before including ML) 87 | #ifndef ML_DEPTH_RANGE_NEAR 88 | # define ML_DEPTH_RANGE_NEAR 0.0f 89 | #endif 90 | 91 | #ifndef ML_DEPTH_RANGE_FAR 92 | # define ML_DEPTH_RANGE_FAR 1.0f 93 | #endif 94 | 95 | // Inline preference 96 | #ifndef ML_INLINE 97 | # if (defined(__GNUC__) || defined(__clang__)) 98 | # define ML_INLINE __attribute__((always_inline)) inline 99 | # else 100 | # define ML_INLINE __forceinline 101 | # endif 102 | #endif 103 | 104 | //====================================================================================================================== 105 | // Macro stuff 106 | //====================================================================================================================== 107 | 108 | // Compiler and environment 109 | 110 | #if defined(__GNUC__) 111 | # pragma GCC diagnostic push
112 | # pragma GCC diagnostic ignored "-Wstrict-aliasing" 113 | 114 | # define ML_ALIGN(alignment, x) x __attribute__((aligned(alignment))) 115 | #elif defined(__clang__) 116 | # pragma clang diagnostic push 117 | # pragma clang diagnostic ignored "-Wstrict-aliasing" 118 | 119 | # define ML_ALIGN(alignment, x) x __attribute__((aligned(alignment))) 120 | #else 121 | # pragma warning(push) 122 | # pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union 123 | 124 | # define ML_ALIGN(alignment, x) __declspec(align(alignment)) x 125 | #endif 126 | 127 | // Headers 128 | 129 | #include // overloaded floor, round, ceil, fmod, sin, cos, tan, asin, acos, atan, atan2, sqrt, pow, log, log2, exp, exp2 130 | #include // overloaded abs 131 | 132 | #include 133 | 134 | #ifndef _WIN32 135 | # include // TODO: needed? 136 | #endif 137 | 138 | #if (defined(__i386__) || defined(__x86_64__) || defined(__SCE__)) 139 | # include 140 | #elif (defined(ML_ARM)) 141 | # include "sse2neon.h" 142 | #else 143 | # include 144 | # if (ML_SVML_AVAILABLE || ML_INTRINSIC_LEVEL >= ML_INTRINSIC_AVX1) 145 | # include // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA, SVML 146 | # elif (ML_INTRINSIC_LEVEL >= ML_INTRINSIC_SSE4) 147 | # include // SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 148 | # else 149 | # include // SSE, SSE2, SSE3, SSSE3 150 | # endif 151 | #endif 152 | 153 | // Misc 154 | #define ML_Unused(...) 
\ 155 | do { \ 156 | (void)sizeof(__VA_ARGS__); \ 157 | } while (0) 158 | #define ML_Stringify_(token) #token 159 | #define ML_Stringify(token) ML_Stringify_(token) 160 | 161 | #if ML_EXEPTIONS 162 | # define ML_ROUNDING_EXEPTIONS_MASK _MM_FROUND_RAISE_EXC 163 | #else 164 | # define ML_ROUNDING_EXEPTIONS_MASK _MM_FROUND_NO_EXC 165 | #endif 166 | 167 | // Debugging 168 | 169 | #define ML_StaticAssertMsg(x, msg) static_assert(x, msg) 170 | 171 | #ifdef _DEBUG 172 | # include // assert 173 | 174 | # define ML_Assert(x) assert(x) 175 | # define ML_AssertMsg(x, msg) assert(msg&& x) 176 | #else 177 | # define ML_Assert(x) ((void)0) 178 | # define ML_AssertMsg(x, msg) ((void)0) 179 | #endif 180 | 181 | // Normalized device coordinates 182 | 183 | #if ML_OGL // Depth range [-1; 1], origin "lower left" 184 | # define ML_NDC_NEAR_NO_REVERSE -1.0f 185 | # define ML_DEPTH_C0 (0.5f * (ML_DEPTH_RANGE_FAR - ML_DEPTH_RANGE_NEAR)) 186 | # define ML_DEPTH_C1 (0.5f * (ML_DEPTH_RANGE_FAR + ML_DEPTH_RANGE_NEAR)) 187 | 188 | template 189 | ML_INLINE T ML_ModifyProjZ(bool isReversed, T c2, T c3) { 190 | return isReversed ? -c2 : c2; 191 | } 192 | 193 | #else // Depth range [0; 1], origin "upper left" 194 | # define ML_NDC_NEAR_NO_REVERSE 0.0f 195 | # define ML_DEPTH_C0 (ML_DEPTH_RANGE_FAR - ML_DEPTH_RANGE_NEAR) 196 | # define ML_DEPTH_C1 ML_DEPTH_RANGE_NEAR 197 | 198 | template 199 | ML_INLINE T ML_ModifyProjZ(bool isReversed, T c2, T c3) { 200 | return T(0.5) * ((isReversed ? 
-c2 : c2) + c3); 201 | } 202 | 203 | #endif 204 | 205 | #define ML_NDC_FAR_NO_REVERSE 1.0f 206 | 207 | #if ML_DEPTH_REVERSED 208 | # define ML_NDC_NEAR ML_NDC_FAR_NO_REVERSE 209 | # define ML_NDC_FAR ML_NDC_NEAR_NO_REVERSE 210 | # define ML_DEPTH_EPS -1e-7f 211 | #else 212 | # define ML_NDC_NEAR ML_NDC_NEAR_NO_REVERSE 213 | # define ML_NDC_FAR ML_NDC_FAR_NO_REVERSE 214 | # define ML_DEPTH_EPS 1e-7f 215 | #endif 216 | 217 | // TODO 218 | 219 | /* 220 | - add some missing HLSL-compatible math functionality 221 | - find a way to improve emulation of intrinsics currently using "for (size_t i = 0;" 222 | - minimize "#ifndef __cplusplus" usage in "ml.hlsli" 223 | - GCC doesn't support "members with constructors in anonymous aggregates" 224 | - search for TODO 225 | */ 226 | 227 | //====================================================================================================================== 228 | // MathLib 229 | //====================================================================================================================== 230 | 231 | #ifdef ML_NAMESPACE 232 | namespace ml { 233 | #endif 234 | 235 | //====================================================================================================================== 236 | // Forward declarations 237 | //====================================================================================================================== 238 | 239 | struct bool2; 240 | struct bool3; 241 | struct bool4; 242 | 243 | union int2; 244 | union int3; 245 | union int4; 246 | 247 | typedef uint32_t uint; 248 | union uint2; 249 | union uint3; 250 | union uint4; 251 | 252 | union float2; 253 | union float3; 254 | union float4; 255 | union float4x4; 256 | 257 | union double2; 258 | union double3; 259 | union double4; 260 | union double4x4; 261 | 262 | //====================================================================================================================== 263 | // Enums 264 | 
//====================================================================================================================== 265 | 266 | enum eStyle : uint8_t { 267 | STYLE_D3D, 268 | STYLE_OGL, 269 | }; 270 | 271 | enum eClip : uint8_t { 272 | CLIP_OUT, 273 | CLIP_IN, 274 | CLIP_PARTIAL, 275 | }; 276 | 277 | enum eCoordinate : uint32_t { 278 | COORD_X = 0, 279 | COORD_Y, 280 | COORD_Z, 281 | COORD_W, 282 | 283 | COORD_2D = 2, 284 | COORD_3D, 285 | COORD_4D, 286 | }; 287 | 288 | enum ePlaneType : uint32_t { 289 | PLANE_LEFT, 290 | PLANE_RIGHT, 291 | PLANE_BOTTOM, 292 | PLANE_TOP, 293 | PLANE_NEAR, 294 | PLANE_FAR, 295 | 296 | PLANES_NUM, 297 | PLANES_NO_NEAR_FAR = 4, 298 | PLANES_NO_FAR = 5, 299 | 300 | PLANE_MASK_L = 1 << PLANE_LEFT, 301 | PLANE_MASK_R = 1 << PLANE_RIGHT, 302 | PLANE_MASK_B = 1 << PLANE_BOTTOM, 303 | PLANE_MASK_T = 1 << PLANE_TOP, 304 | PLANE_MASK_N = 1 << PLANE_NEAR, 305 | PLANE_MASK_F = 1 << PLANE_FAR, 306 | 307 | PLANE_MASK_NONE = 0, 308 | PLANE_MASK_LRBT = PLANE_MASK_L | PLANE_MASK_R | PLANE_MASK_B | PLANE_MASK_T, 309 | PLANE_MASK_NF = PLANE_MASK_N | PLANE_MASK_F, 310 | PLANE_MASK_LRBTNF = PLANE_MASK_LRBT | PLANE_MASK_NF, 311 | }; 312 | 313 | enum eProjectionData { 314 | PROJ_ZNEAR, 315 | PROJ_ZFAR, 316 | PROJ_ASPECT, 317 | PROJ_FOVX, 318 | PROJ_FOVY, 319 | PROJ_MINX, 320 | PROJ_MAXX, 321 | PROJ_MINY, 322 | PROJ_MAXY, 323 | PROJ_DIRX, 324 | PROJ_DIRY, 325 | PROJ_ANGLEMINX, 326 | PROJ_ANGLEMAXX, 327 | PROJ_ANGLEMINY, 328 | PROJ_ANGLEMAXY, 329 | 330 | PROJ_NUM, 331 | }; 332 | 333 | enum eProjectionFlag { 334 | PROJ_ORTHO = 0x00000001, 335 | PROJ_REVERSED_Z = 0x00000002, 336 | PROJ_LEFT_HANDED = 0x00000004, 337 | }; 338 | 339 | template 340 | ML_INLINE void Swap(T& x, T& y) { 341 | T t = x; 342 | x = y; 343 | y = t; 344 | } 345 | 346 | //====================================================================================================================== 347 | // Intrinsic emulation 348 | 
//====================================================================================================================== 349 | 350 | #include "Guts/emulation.h" 351 | 352 | //====================================================================================================================== 353 | // Math 354 | //====================================================================================================================== 355 | 356 | #include "Guts/math.h" 357 | 358 | //====================================================================================================================== 359 | // Floating point tricks 360 | //====================================================================================================================== 361 | 362 | union uFloat { 363 | float f; 364 | uint32_t i; 365 | 366 | ML_INLINE uFloat() 367 | : i(0) { 368 | } 369 | 370 | ML_INLINE uFloat(float x) 371 | : f(x) { 372 | } 373 | 374 | ML_INLINE uFloat(uint32_t x) 375 | : i(x) { 376 | } 377 | 378 | ML_INLINE void abs() { 379 | i &= ~(1 << 31); 380 | } 381 | 382 | ML_INLINE bool IsNegative() const { 383 | return (i >> 31) != 0; 384 | } 385 | 386 | ML_INLINE uint32_t Mantissa() const { 387 | return i & ((1 << 23) - 1); 388 | } 389 | 390 | ML_INLINE uint32_t Exponent() const { 391 | return (i >> 23) & 255; 392 | } 393 | 394 | ML_INLINE bool IsInf() const { 395 | return Exponent() == 255 && Mantissa() == 0; 396 | } 397 | 398 | ML_INLINE bool IsNan() const { 399 | return Exponent() == 255 && Mantissa() != 0; 400 | } 401 | 402 | static ML_INLINE float PrecisionGreater(float x) { 403 | uFloat y(x); 404 | y.i++; 405 | 406 | return y.f - x; 407 | } 408 | 409 | static ML_INLINE float PrecisionLess(float x) { 410 | uFloat y(x); 411 | y.i--; 412 | 413 | return y.f - x; 414 | } 415 | }; 416 | 417 | union uDouble { 418 | double f; 419 | uint64_t i; 420 | 421 | ML_INLINE uDouble() 422 | : i(0) { 423 | } 424 | 425 | ML_INLINE uDouble(double x) 426 | : f(x) { 427 | } 428 | 429 | 
ML_INLINE uDouble(uint64_t x) 430 | : i(x) { 431 | } 432 | 433 | ML_INLINE bool IsNegative() const { 434 | return (i >> 63) != 0; 435 | } 436 | 437 | ML_INLINE void abs() { 438 | i &= ~(1ULL << 63); 439 | } 440 | 441 | ML_INLINE uint64_t Mantissa() const { 442 | return i & ((1ULL << 52) - 1); 443 | } 444 | 445 | ML_INLINE uint64_t Exponent() const { 446 | return (i >> 52) & 2047; 447 | } 448 | 449 | ML_INLINE bool IsInf() const { 450 | return Exponent() == 2047 && Mantissa() == 0; 451 | } 452 | 453 | ML_INLINE bool IsNan() const { 454 | return Exponent() == 2047 && Mantissa() != 0; 455 | } 456 | 457 | static ML_INLINE double PrecisionGreater(double x) { 458 | uDouble y(x); 459 | y.i++; 460 | 461 | return y.f - x; 462 | } 463 | 464 | static ML_INLINE double PrecisionLess(double x) { 465 | uDouble y(x); 466 | y.i--; 467 | 468 | return y.f - x; 469 | } 470 | }; 471 | 472 | //====================================================================================================================== 473 | // Data types 474 | //====================================================================================================================== 475 | 476 | #define ML_COMPARE_UNOPT(B, C, op) \ 477 | ML_INLINE B operator op(const C& v) const { \ 478 | int32_t mask = x op v.x ? 0x1 : 0; \ 479 | mask |= y op v.y ? 
0x2 : 0; \ 480 | return B(mask); \ 481 | } 482 | 483 | #define ML_COMPARE(B, C, op, f, movemask, reg) \ 484 | ML_INLINE B operator op(const C& v) const { \ 485 | return B(movemask(f(reg, v.reg))); \ 486 | } 487 | 488 | #define ML_OP_UNOPT(C, T, op, opeq) \ 489 | ML_INLINE C operator op(const C& v) const { \ 490 | return C(x op v.x, y op v.y); \ 491 | } \ 492 | ML_INLINE friend C operator op(T c, const C& v) { \ 493 | return C(c op v.x, c op v.y); \ 494 | } \ 495 | ML_INLINE friend C operator op(const C& v, T c) { \ 496 | return C(v.x op c, v.y op c); \ 497 | } \ 498 | ML_INLINE void operator opeq(const C& v) { \ 499 | x opeq v.x; \ 500 | y opeq v.y; \ 501 | } \ 502 | ML_INLINE void operator opeq(T c) { \ 503 | x opeq c; \ 504 | y opeq c; \ 505 | } 506 | 507 | #define ML_OP(C, T, op, opeq, f, broadcast, reg) \ 508 | ML_INLINE C operator op(const C& v) const { \ 509 | return f(reg, v.reg); \ 510 | } \ 511 | ML_INLINE friend C operator op(T c, const C& v) { \ 512 | return f(broadcast(c), v.reg); \ 513 | } \ 514 | ML_INLINE friend C operator op(const C& v, T c) { \ 515 | return f(v.reg, broadcast(c)); \ 516 | } \ 517 | ML_INLINE void operator opeq(const C& v) { \ 518 | reg = f(reg, v.reg); \ 519 | } \ 520 | ML_INLINE void operator opeq(T c) { \ 521 | reg = f(reg, broadcast(c)); \ 522 | } 523 | 524 | // Vector swizzling 525 | #include "Guts/swizzle.h" 526 | 527 | // Boolean (1 bit emulation) 528 | #include "Guts/bool1.h" 529 | 530 | // Integer 531 | #include "Guts/i32.h" 532 | #include "Guts/u32.h" 533 | 534 | // Float 535 | #include "Guts/f16.h" 536 | #include "Guts/f32.h" 537 | #include "Guts/f64.h" 538 | 539 | // Conversion 540 | #include "Guts/conversion.h" 541 | 542 | #undef ML_COMPARE_UNOPT 543 | #undef ML_COMPARE 544 | #undef ML_OP_UNOPT 545 | #undef ML_OP 546 | 547 | #undef ML_SWIZZLE_2 548 | #undef ML_SWIZZLE_3 549 | #undef ML_SWIZZLE_4 550 | 551 | #undef ML_X 552 | #undef ML_Y 553 | #undef ML_Z 554 | #undef ML_W 555 | 556 | 
//====================================================================================================================== 557 | // Misc 558 | //====================================================================================================================== 559 | 560 | template 561 | ML_INLINE T CurveSmooth(const T& x) { 562 | return x * x * (3.0 - 2.0 * x); 563 | } 564 | 565 | template 566 | ML_INLINE T CurveSin(const T& x) { 567 | return x * (1.0 - x * x / 3.0); 568 | } 569 | 570 | template 571 | ML_INLINE T WaveTriangle(const T& x) { 572 | return abs(frac(x + T(0.5)) * T(2.0) - T(1.0)); 573 | } 574 | 575 | template 576 | ML_INLINE T WaveTriangleSmooth(const T& x) { 577 | return CurveSmooth(WaveTriangle(x)); 578 | } 579 | 580 | ML_INLINE float DoubleToGequal(double dValue) { 581 | float fValue = (float)dValue; 582 | float fError = (float)(dValue - fValue); 583 | 584 | int32_t exponent = 0; 585 | frexp(fValue, &exponent); 586 | exponent = max(exponent, 0); 587 | exponent = (int32_t)log10f(float(1 << exponent)); 588 | 589 | float fStep = 1.0f / pow(10.0f, float(7 - exponent)); 590 | 591 | while (fError > 0.0f) { 592 | fValue += fStep; 593 | 594 | float fCurrError = float(dValue - fValue); 595 | 596 | if (fCurrError == fError) 597 | fStep += fStep; 598 | else 599 | fError = fCurrError; 600 | } 601 | 602 | return fValue; 603 | } 604 | 605 | ML_INLINE float DoubleToLequal(double dValue) { 606 | float fValue = (float)dValue; 607 | float fError = (float)(dValue - fValue); 608 | 609 | int32_t exponent = 0; 610 | frexp(fValue, &exponent); 611 | exponent = max(exponent, 0); 612 | exponent = (int32_t)log10f(float(1 << exponent)); 613 | 614 | float fStep = 1.0f / pow(10.0f, float(7 - exponent)); 615 | 616 | while (fError < 0.0f) { 617 | fValue -= fStep; 618 | 619 | float fCurrError = float(dValue - fValue); 620 | 621 | if (fCurrError == fError) 622 | fStep += fStep; 623 | else 624 | fError = fCurrError; 625 | } 626 | 627 | return fValue; 628 | } 629 | 630 | 
//====================================================================================================================== 631 | // Rect 632 | //====================================================================================================================== 633 | 634 | template 635 | class ctRect { 636 | public: 637 | union { 638 | struct { 639 | T vMin[COORD_2D]; 640 | }; 641 | 642 | struct { 643 | T minx; 644 | T miny; 645 | }; 646 | }; 647 | 648 | union { 649 | struct { 650 | T vMax[COORD_2D]; 651 | }; 652 | 653 | struct { 654 | T maxx; 655 | T maxy; 656 | }; 657 | }; 658 | 659 | public: 660 | ML_INLINE ctRect() { 661 | Clear(); 662 | } 663 | 664 | ML_INLINE void Clear() { 665 | minx = miny = T(1 << 30); 666 | maxx = maxy = T(-(1 << 30)); 667 | } 668 | 669 | ML_INLINE bool IsValid() const { 670 | return maxx > minx && maxy > miny; 671 | } 672 | 673 | ML_INLINE void Add(T px, T py) { 674 | minx = min(minx, px); 675 | maxx = max(maxx, px); 676 | miny = min(miny, py); 677 | maxy = max(maxy, py); 678 | } 679 | 680 | ML_INLINE void Add(const T* pPoint2) { 681 | Add(pPoint2[0], pPoint2[1]); 682 | } 683 | 684 | ML_INLINE bool IsIntersectWith(const T* pMin, const T* pMax) const { 685 | ML_Assert(IsValid()); 686 | 687 | if (maxx < pMin[0] || maxy < pMin[1] || minx > pMax[0] || miny > pMax[1]) 688 | return false; 689 | 690 | return true; 691 | } 692 | 693 | ML_INLINE bool IsIntersectWith(const ctRect& rRect) const { 694 | return IsIntersectWith(rRect.vMin, rRect.vMax); 695 | } 696 | 697 | ML_INLINE eClip GetIntersectionStateWith(const T* pMin, const T* pMax) const { 698 | ML_Assert(IsValid()); 699 | 700 | if (!IsIntersectWith(pMin, pMax)) 701 | return CLIP_OUT; 702 | 703 | if (minx < pMin[0] && maxx > pMax[0] && miny < pMin[1] && maxy > pMax[1]) 704 | return CLIP_IN; 705 | 706 | return CLIP_PARTIAL; 707 | } 708 | 709 | ML_INLINE eClip GetIntersectionStateWith(const ctRect& rRect) const { 710 | return GetIntersectionStateWith(rRect.vMin, rRect.vMax); 711 | } 712 | }; 

//======================================================================================================================
// Frustum
//======================================================================================================================

// Extracts 6 frustum planes from the matrix "m" into "pvPlane6" (indexed by "ePlaneType").
// With "STYLE_OGL" the near plane is "row 2 + row 3" of "m", otherwise just "row 2".
// Returns "true" if the projection was detected as reversed (near & far planes were swapped).
ML_INLINE bool MvpToPlanes(eStyle depthStyle, const float4x4& m, float4* pvPlane6) {
    const float eps = 1e-7f;

    // "mt[i]" is the i-th element of the transposed matrix
    float4x4 mt;
    m.TransposeTo(mt);

    float4 l = mt[3] + mt[0];
    float4 r = mt[3] - mt[0];
    float4 b = mt[3] + mt[1];
    float4 t = mt[3] - mt[1];
    float4 f = mt[3] - mt[2];
    float4 n = mt[2];

    if (depthStyle == STYLE_OGL)
        n += mt[3];

    // Side planes (normalized by the length of "xyz", no protection against zero length)
    l *= rsqrt(dot(l.xyz, l.xyz));
    r *= rsqrt(dot(r.xyz, r.xyz));
    b *= rsqrt(dot(b.xyz, b.xyz));
    t *= rsqrt(dot(t.xyz, t.xyz));

    // Near & far planes (length is clamped by "eps", because it can be zero - see "infinite projection" below)
    n /= max(length(n.xyz), eps);
    f /= max(length(f.xyz), eps);

    // Handle reversed projection
    bool bReversed = abs(n.w) > abs(f.w);

    if (bReversed)
        Swap(n, f);

    // Handle infinite projection (degenerate far plane: use the flipped near plane normal, keep "w")
    if (length(f.xyz) < eps)
        f = float4(-n.x, -n.y, -n.z, f.w);

    pvPlane6[PLANE_LEFT] = l;
    pvPlane6[PLANE_RIGHT] = r;
    pvPlane6[PLANE_BOTTOM] = b;
    pvPlane6[PLANE_TOP] = t;
    pvPlane6[PLANE_NEAR] = n;
    pvPlane6[PLANE_FAR] = f;

    return bReversed;
}

// View frustum represented by 6 planes (indexed by "ePlaneType") with sphere, AABB and capsule tests.
// In all tests "planes" is the number of leading planes to check ("PLANES_NUM", "PLANES_NO_FAR", ...),
// and an object is rejected if it is entirely on the negative side of any checked plane.
class cFrustum {
private:
    float4 m_vPlane[PLANES_NUM] = {}; // planes, filled by "MvpToPlanes"
    float4x4 m_mPlanesT = {};         // left, right, bottom & top planes, transposed (written in "Setup" only)
    v4f m_vMask[PLANES_NUM] = {};     // per plane: component-wise "plane > 0" mask, selects an AABB corner

public:
    // Extracts planes from "mMvp" and rebuilds the cached data
    ML_INLINE void Setup(eStyle depthStyle, const float4x4& mMvp) {
        MvpToPlanes(depthStyle, mMvp, m_vPlane);

        m_mPlanesT[0] = m_vPlane[PLANE_LEFT];
        m_mPlanesT[1] = m_vPlane[PLANE_RIGHT];
        m_mPlanesT[2] = m_vPlane[PLANE_BOTTOM];
        m_mPlanesT[3] = m_vPlane[PLANE_TOP];
        m_mPlanesT.Transpose();

        for (uint32_t i = 0; i < PLANES_NUM; i++)
            m_vMask[i] = _mm_cmpgt_ps(m_vPlane[i].xmm, _mm_setzero_ps());
    }

    // Replaces "w" of each plane with "Dot43(plane, vPos)" (presumably plane.xyz * vPos + plane.w - TODO confirm)
    ML_INLINE void Translate(const float3& vPos) {
        // Update of m_vMask is not required, because only m_vMask.w can be changed, but this component doesn't affect results
        for (uint32_t i = 0; i < PLANES_NUM; i++)
            m_vPlane[i].w = Dot43(m_vPlane[i], vPos);
    }

    // Returns "false" if the sphere is farther than "fRadius" behind any of the first "planes" planes
    ML_INLINE bool CheckSphere(const float3& center, float fRadius, uint32_t planes = PLANES_NUM) const {
        v4f p1 = v4f_setw1(center.xmm); // presumably (center.xyz, 1) - TODO confirm

        for (uint32_t i = 0; i < planes; i++) {
            float d = dot(m_vPlane[i], p1);

            if (d < -fRadius)
                return false;
        }

        return true;
    }

    // Returns "false" if the corner selected by "m_vMask" (max where the plane component is positive,
    // min otherwise) is reported negative for any of the first "planes" planes
    ML_INLINE bool CheckAabb(const float3& minv, const float3& maxv, uint32_t planes) const {
        v4f min1 = v4f_setw1(minv.xmm);
        v4f max1 = v4f_setw1(maxv.xmm);

        for (uint32_t i = 0; i < planes; i++) {
            v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
            v = v4f_dot44(m_vPlane[i].xmm, v);

            if (v4f_isnegative1_all(v))
                return false;
        }

        return true;
    }

    // Capsule = segment [capsule_start, capsule_start + capsule_axis] with radius "capsule_radius".
    // Returns "false" if the capsule is rejected by any of the first "planes" planes.
    ML_INLINE bool CheckCapsule(const float3& capsule_start, const float3& capsule_axis, float capsule_radius, uint32_t planes) const {
        // https://github.com/toxygen/STA/blob/master/celestia-src/celmath/frustum.cpp

        float r2 = capsule_radius * capsule_radius;
        float3 capsule_end = capsule_start + capsule_axis;

        for (uint32_t i = 0; i < planes; i++) {
            float signedDist0 = Dot43(m_vPlane[i], capsule_start);
            float signedDist1 = Dot43(m_vPlane[i], capsule_end);

            // Only the endpoint closest to the plane is tested, and only if the product of distances exceeds r^2
            if (signedDist0 * signedDist1 > r2) {
                if (abs(signedDist0) <= abs(signedDist1)) {
                    if (signedDist0 < -capsule_radius)
                        return false;
                } else {
                    if (signedDist1 < -capsule_radius)
                        return false;
                }
            }
        }

        return true;
    }

    // Same as "CheckSphere", but planes with a set bit in "mask" are skipped
    ML_INLINE bool CheckSphere_mask(const float3& center, float fRadius, uint32_t mask, uint32_t planes) const {
        v4f p1 = v4f_setw1(center.xmm);

        for (uint32_t i = 0; i < planes; i++) {
            if (!(mask & (1 << i))) {
                float d = dot(m_vPlane[i], p1);

                if (d < -fRadius)
                    return false;
            }
        }

        return true;
    }

    // Same as "CheckAabb", but planes with a set bit in "mask" are skipped
    ML_INLINE bool CheckAabb_mask(const float3& minv, const float3& maxv, uint32_t mask, uint32_t planes) const {
        v4f min1 = v4f_setw1(minv.xmm);
        v4f max1 = v4f_setw1(maxv.xmm);

        for (uint32_t i = 0; i < planes; i++) {
            if (!(mask & (1 << i))) {
                v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
                v = v4f_dot44(m_vPlane[i].xmm, v);

                if (v4f_isnegative1_all(v))
                    return false;
            }
        }

        return true;
    }

    // Returns "CLIP_OUT" if the sphere is rejected by a plane, "CLIP_PARTIAL" if it is closer than
    // "fRadius" to at least one plane, "CLIP_IN" otherwise
    ML_INLINE eClip CheckSphere_state(const float3& center, float fRadius, uint32_t planes) const {
        v4f p1 = v4f_setw1(center.xmm);

        eClip clip = CLIP_IN;

        for (uint32_t i = 0; i < planes; i++) {
            float d = dot(m_vPlane[i], p1);

            if (d < -fRadius)
                return CLIP_OUT;

            if (d < fRadius)
                clip = CLIP_PARTIAL;
        }

        return clip;
    }

    // Returns "CLIP_OUT" if the box is rejected by a plane, "CLIP_PARTIAL" if the opposite corner
    // is negative for at least one plane, "CLIP_IN" otherwise
    ML_INLINE eClip CheckAabb_state(const float3& minv, const float3& maxv, uint32_t planes) const {
        v4f min1 = v4f_setw1(minv.xmm);
        v4f max1 = v4f_setw1(maxv.xmm);

        eClip clip = CLIP_IN;

        for (uint32_t i = 0; i < planes; i++) {
            // Corner selected by the mask: if it is negative, the box is rejected
            v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
            v = v4f_dot44(m_vPlane[i].xmm, v);

            if (v4f_isnegative1_all(v))
                return CLIP_OUT;

            // Opposite corner: if it is negative, the plane cuts the box
            v = _mm_blendv_ps(max1, min1, m_vMask[i]);
            v = v4f_dot44(m_vPlane[i].xmm, v);

            if (v4f_isnegative1_all(v))
                clip = CLIP_PARTIAL;
        }

        return clip;
    }

    // "CheckCapsule" variant returning "CLIP_OUT", "CLIP_PARTIAL" (at least one plane is intersected) or "CLIP_IN"
    ML_INLINE eClip CheckCapsule_state(const float3& capsule_start, const float3& capsule_axis, float capsule_radius, uint32_t planes) const {
        float r2 = capsule_radius * capsule_radius;
        float3 capsule_end = capsule_start + capsule_axis;

        // Bit "i" is set if plane "i" intersects the capsule
        uint32_t intersections = 0;

        for (uint32_t i = 0; i < planes; i++) {
            float signedDist0 = Dot43(m_vPlane[i], capsule_start);
            float signedDist1 = Dot43(m_vPlane[i], capsule_end);

            if (signedDist0 * signedDist1 > r2) {
                // Endpoints of capsule are on same side of plane. Test closest endpoint to see if it lies closer to the plane than radius
                if (abs(signedDist0) <= abs(signedDist1)) {
                    if (signedDist0 < -capsule_radius)
                        return CLIP_OUT;
                    else if (signedDist0 < capsule_radius)
                        intersections |= (1 << i);
                } else {
                    if (signedDist1 < -capsule_radius)
                        return CLIP_OUT;
                    else if (signedDist1 < capsule_radius)
                        intersections |= (1 << i);
                }
            } else {
                // Capsule endpoints are on different sides of the plane, so we have an intersection
                intersections |= (1 << i);
            }
        }

        return !intersections ? CLIP_IN : CLIP_PARTIAL;
    }

    // "CheckSphere_state" with an in/out "mask": planes with a set bit are skipped, and the bit of each
    // tested plane which the sphere does not touch ("d >= fRadius") gets set
    ML_INLINE eClip CheckSphere_mask_state(const float3& center, float fRadius, uint32_t& mask, uint32_t planes) const {
        v4f p1 = v4f_setw1(center.xmm);

        eClip clip = CLIP_IN;

        for (uint32_t i = 0; i < planes; i++) {
            if (!(mask & (1 << i))) {
                float d = dot(m_vPlane[i], p1);

                if (d < -fRadius)
                    return CLIP_OUT;

                if (d < fRadius)
                    clip = CLIP_PARTIAL;
                else
                    mask |= 1 << i;
            }
        }

        return clip;
    }

    // "CheckAabb_state" with an in/out "mask": planes with a set bit are skipped, and the bit of each
    // tested plane which does not cut the box gets set
    ML_INLINE eClip CheckAabb_mask_state(const float3& minv, const float3& maxv, uint32_t& mask, uint32_t planes) const {
        v4f min1 = v4f_setw1(minv.xmm);
        v4f max1 = v4f_setw1(maxv.xmm);

        eClip result = CLIP_IN;

        for (uint32_t i = 0; i < planes; i++) {
            if (!(mask & (1 << i))) {
                v4f v = _mm_blendv_ps(min1, max1, m_vMask[i]);
                v = v4f_dot44(m_vPlane[i].xmm, v);

                if (v4f_isnegative1_all(v))
                    return CLIP_OUT;

                v = _mm_blendv_ps(max1, min1, m_vMask[i]);
                v = v4f_dot44(m_vPlane[i].xmm, v);

                if (v4f_isnegative1_all(v))
                    result = CLIP_PARTIAL;
                else
                    mask |= 1 << i;
            }
        }

        return result;
    }

    // Overrides "w" of the near & far planes: near.w = zNearNeg, far.w = -zFarNeg
    ML_INLINE void SetNearFar(float zNearNeg, float zFarNeg) {
        m_vPlane[PLANE_NEAR].w = zNearNeg;
        m_vPlane[PLANE_FAR].w = -zFarNeg;
    }

    // Overrides "w" of the far plane: far.w = -zFarNeg
    ML_INLINE void SetFar(float zFarNeg) {
        m_vPlane[PLANE_FAR].w = -zFarNeg;
    }

    // Returns the plane with index "plane" (must be less than "PLANES_NUM")
    ML_INLINE const float4& GetPlane(uint32_t plane) {
        ML_Assert(plane < PLANES_NUM);

        return m_vPlane[plane];
    }
};

// Recovers projection parameters from the projection matrix "proj". Every output pointer is optional (can be NULL):
//   puiFlags      - combination of "eProjectionFlag" bits
//   pfSettings15  - "PROJ_NUM" values indexed by "eProjectionData"
//   pfUnproject2  - 2 constants for "z = u0 / (depth + u1)"
//   pfFrustum4    - 4 constants for view space position reconstruction from screen space uv
//   pfProject3    - 3 constants for radius projection (see the comment in the body)
//   pfSafeNearZ   - 1 value, near Z offset by "ML_DEPTH_EPS" and scaled for non-orthographic projections
// "originStyle" affects "pfFrustum4" only, "depthStyle" is forwarded to "MvpToPlanes".
ML_INLINE void DecomposeProjection(eStyle originStyle, eStyle depthStyle, const float4x4& proj, uint32_t* puiFlags, float* pfSettings15, float* pfUnproject2, float* pfFrustum4,
    float* pfProject3, float* pfSafeNearZ) {
    float4 vPlane[PLANES_NUM];
    bool bReversedZ = MvpToPlanes(depthStyle, proj, vPlane);

    // A projection is treated as orthographic if "a33" is exactly 1
    bool bIsOrtho = proj.a33 == 1.0f ? true : false;

    float fNearZ = -vPlane[PLANE_NEAR].w;
    float fFarZ = vPlane[PLANE_FAR].w;

    // Frustum extents: plane offsets for orthographic projections, slopes (ratios of plane components) otherwise
    float x0, x1, y0, y1;
    if (bIsOrtho) {
        x0 = -vPlane[PLANE_LEFT].w;
        x1 = vPlane[PLANE_RIGHT].w;
        y0 = -vPlane[PLANE_BOTTOM].w;
        y1 = vPlane[PLANE_TOP].w;

        if (proj.a11 < 0.0f)
            Swap(y0, y1);
    } else {
        x0 = vPlane[PLANE_LEFT].z / vPlane[PLANE_LEFT].x;
        x1 = vPlane[PLANE_RIGHT].z / vPlane[PLANE_RIGHT].x;
        y0 = vPlane[PLANE_BOTTOM].z / vPlane[PLANE_BOTTOM].y;
        y1 = vPlane[PLANE_TOP].z / vPlane[PLANE_TOP].y;
    }

    // Handedness: sign of the triple product of the first two columns and the (reconstructed) third axis
    // const float3& col2 = bReversedZ ? proj.col3 : proj.col2;
    float4 clip = proj * float4(0.0f, 0.0f, fNearZ, 1.0f);
    float3 col2 = bIsOrtho ? float3(proj.Col(2)) * (bReversedZ ? -1.0f : 1.0f) : float3(0.0f, 0.0f, clip.w > 0.0f ? 1.0f : -1.0f);
    bool cmp = dot(cross(float3(proj.Col(0)), float3(proj.Col(1))), col2.xyz) > 0.0f;
    bool bLeftHanded = proj.a11 > 0.0f ? cmp : !cmp;

    if (puiFlags) {
        *puiFlags = bIsOrtho ? PROJ_ORTHO : 0;
        *puiFlags |= bReversedZ ? PROJ_REVERSED_Z : 0;
        *puiFlags |= bLeftHanded ? PROJ_LEFT_HANDED : 0;
    }

    if (pfUnproject2) {
        // z = u0 / (depth + u1)

        pfUnproject2[0] = ML_DEPTH_C0 * proj.a23 / proj.a32;
        pfUnproject2[1] = -(ML_DEPTH_C0 * proj.a22 / proj.a32 + ML_DEPTH_C1);

        // z = 1 / (depth * u0 + u1);

        // pfUnproject2[0] = proj.a32 / (ML_DEPTH_C0 * proj.a23);
        // pfUnproject2[1] = -(proj.a22 / proj.a23 + ML_DEPTH_C1 / pfUnproject2[0]);
    }

    if (pfSafeNearZ) {
        *pfSafeNearZ = fNearZ - ML_DEPTH_EPS;

        if (!bIsOrtho) {
            // Scale by the length of the ray through the farthest frustum corner at unit depth
            float maxx = max(abs(x0), abs(x1));
            float maxy = max(abs(y0), abs(y1));

            *pfSafeNearZ *= sqrt(maxx * maxx + maxy * maxy + 1.0f);
        }
    }

    if (pfProject3) {
        // IMPORTANT: Rg - geometry radius, Rp - projected radius, Rn - projected normalized radius
        // keep in mind:
        //      zp = -(mView * p).z
        //      zp_fix = mix(zp, 1.0, bIsOrtho), or
        //      zp_fix = (mViewProj * p).w
        // project:
        //      Rn.x = Rg * pfProject3[0] / zp_fix
        //      Rn.y = Rg * pfProject3[1] / zp_fix
        //      Rp = 0.5 * viewport.w * Rn.x, or
        //      Rp = 0.5 * viewport.h * Rn.y, or
        //      Rp = Rg * K / zp_fix
        // unproject:
        //      Rn.x = 2.0 * Rp / viewport.w
        //      Rn.y = 2.0 * Rp / viewport.h
        //      Rg = Rn.x * zp_fix / pfProject3[0], or
        //      Rg = Rn.y * zp_fix / pfProject3[1], or
        //      Rg = Rp * zp_fix / K
        // K = 0.5 * viewport.w * pfProject3[0] = 0.5 * viewport.h * pfProject3[1]

        float fProjectx = 2.0f / (x1 - x0);
        float fProjecty = 2.0f / (y1 - y0);

        pfProject3[0] = abs(fProjectx);
        pfProject3[1] = abs(fProjecty);
        pfProject3[2] = bIsOrtho ? 1.0f : 0.0f;
    }

    if (pfFrustum4) {
        // IMPORTANT: view space position from screen space uv [0, 1]
        //      ray.xy = (pfFrustum4.zw * uv + pfFrustum4.xy) * mix(zDistanceNeg, -1.0, bIsOrtho)
        //      ray.z = 1.0 * zDistanceNeg

        pfFrustum4[0] = -x0;
        pfFrustum4[2] = x0 - x1;

        // The vertical direction depends on the screen space origin convention
        if (originStyle == STYLE_D3D) {
            pfFrustum4[1] = -y1;
            pfFrustum4[3] = y1 - y0;
        } else {
            pfFrustum4[1] = -y0;
            pfFrustum4[3] = y0 - y1;
        }
    }

    if (pfSettings15) {
        // Swap is possible, because it is the last pass...
        if (x0 > x1)
            Swap(x0, x1);

        if (y0 > y1)
            Swap(y0, y1);

        // Angles are zero for orthographic projections
        float fAngleY0 = atan(bIsOrtho ? 0.0f : y0);
        float fAngleY1 = atan(bIsOrtho ? 0.0f : y1);
        float fAngleX0 = atan(bIsOrtho ? 0.0f : x0);
        float fAngleX1 = atan(bIsOrtho ? 0.0f : x1);

        float fAspect = (x1 - x0) / (y1 - y0);

        pfSettings15[PROJ_ZNEAR] = fNearZ;
        pfSettings15[PROJ_ZFAR] = fFarZ;
        pfSettings15[PROJ_ASPECT] = fAspect;
        pfSettings15[PROJ_FOVX] = fAngleX1 - fAngleX0;
        pfSettings15[PROJ_FOVY] = fAngleY1 - fAngleY0;
        pfSettings15[PROJ_MINX] = x0 * fNearZ;
        pfSettings15[PROJ_MAXX] = x1 * fNearZ;
        pfSettings15[PROJ_MINY] = y0 * fNearZ;
        pfSettings15[PROJ_MAXY] = y1 * fNearZ;
        pfSettings15[PROJ_ANGLEMINX] = fAngleX0;
        pfSettings15[PROJ_ANGLEMAXX] = fAngleX1;
        pfSettings15[PROJ_ANGLEMINY] = fAngleY0;
        pfSettings15[PROJ_ANGLEMAXY] = fAngleY1;
        pfSettings15[PROJ_DIRX] = (fAngleX0 + fAngleX1) * 0.5f;
        pfSettings15[PROJ_DIRY] = (fAngleY0 + fAngleY1) * 0.5f;
    }
}

#include "Guts/other.h"
#include "Guts/packing.h"
#include "Guts/sorting.h"

#ifdef ML_NAMESPACE
}
#endif

//====================================================================================================================== 1160 | // End 1161 | //====================================================================================================================== 1162 | 1163 | #if defined(__GNUC__) 1164 | # pragma GCC diagnostic pop 1165 | #elif defined(__clang__) 1166 | # pragma clang diagnostic pop 1167 | #else 1168 | # pragma warning(pop) 1169 | #endif 1170 | --------------------------------------------------------------------------------