├── .clang-format ├── .gitignore ├── .gitmodules ├── LICENSE.md ├── Makefile ├── README.md ├── packages.config ├── phyx.sln ├── phyx.vcxproj ├── phyx.vcxproj.filters └── src ├── AABB2.h ├── Collider.cpp ├── Collider.h ├── Configuration.h ├── Coords2.h ├── Geom.h ├── Joints.h ├── Manifold.h ├── RigidBody.h ├── Solver.cpp ├── Solver.h ├── Vector2.h ├── World.cpp ├── World.h ├── base ├── AlignedArray.h ├── DenseHash.h ├── Parallel.h ├── RadixSort.h ├── SIMD.h ├── SIMD_AVX2.h ├── SIMD_AVX2_Transpose.h ├── SIMD_SSE2.h ├── SIMD_Scalar.h ├── WorkQueue.cpp ├── WorkQueue.h └── microprofile.cpp ├── glad ├── glad.c ├── glad.h └── khrplatform.h └── main.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | UseTab: Never 2 | IndentWidth: 4 3 | BreakBeforeBraces: Allman 4 | AllowShortIfStatementsOnASingleLine: true 5 | IndentCaseLabels: false 6 | ColumnLimit: 0 7 | PointerAlignment: Left 8 | BreakConstructorInitializersBeforeComma: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .microprofilepreset.* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/microprofile"] 2 | path = src/microprofile 3 | url = git@github.com:zeux/microprofile.git 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Alexander Sannikov, Arseny Kapoulkine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .SUFFIXES: 2 | MAKEFLAGS+=-r 3 | 4 | BUILD=build 5 | 6 | SOURCES=$(wildcard src/*.cpp src/base/*.cpp) 7 | OBJECTS=$(SOURCES:%=$(BUILD)/%.o) 8 | 9 | EXECUTABLE=$(BUILD)/phyx 10 | 11 | CXXFLAGS=-g -Wall -std=c++11 -O3 -DNDEBUG -ffast-math -Isrc/microprofile 12 | LDFLAGS= 13 | 14 | ifeq ($(shell uname),Darwin) 15 | CXXFLAGS+=-mavx2 -mfma 16 | LDFLAGS+=-lglfw3 -framework OpenGL 17 | else 18 | CPUINFO=$(shell cat /proc/cpuinfo) 19 | ifneq ($(findstring avx2,$(CPUINFO)),) 20 | CXXFLAGS+=-mavx2 21 | endif 22 | ifneq ($(findstring fma,$(CPUINFO)),) 23 | CXXFLAGS+=-mfma 24 | endif 25 | LDFLAGS+=-lglfw -lGL -lpthread 26 | endif 27 | 28 | all: $(EXECUTABLE) 29 | ./$(EXECUTABLE) 30 | 31 | $(EXECUTABLE): $(OBJECTS) 32 | $(CXX) $(OBJECTS) $(LDFLAGS) -o $@ 33 | 34 | $(BUILD)/%.o: % 35 | @mkdir -p $(dir $@) 36 | $(CXX) $< $(CXXFLAGS) -c -MMD -MP -o $@ 37 | 38 | -include $(OBJECTS:.o=.d) 39 | clean: 40 | rm -rf $(BUILD) 41 | 42 | .PHONY: all clean 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PhyX 2 | 3 | ## Overview 4 | 5 | This is a 2D physics engine with SoA/SIMD/multicore optimizations. The primary purpose of this project is to experiment with various multi-threading and SIMD algorithms - it is not meant to be a production-ready 2D physics engine. 6 | 7 | The engine is based on SusliX by Alexander Sannikov aka Suslik <i.make.physics at gmail.com> \ 8 | The optimizations are implemented by Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 9 | 10 | The code is licensed under the MIT license. 11 | 12 | ## Video 13 | 14 | [![Massive Rigid Body Simulation](https://img.youtube.com/vi/2-UZkEjnBu4/0.jpg)](https://www.youtube.com/watch?v=2-UZkEjnBu4) 15 | 16 | ## Building 17 | 18 | To build the project, you first have to clone it using the `--recursive` option to fetch the [microprofile](https://github.com/zeux/microprofile) submodule. 19 | 20 | On Linux/Mac, use `make` to build and run the demo (the default target builds the executable and then launches it). The Makefile automatically picks the optimal compilation settings for the host system, using AVX2/FMA as appropriate. 21 | 22 | On Windows, open `phyx.sln` in Visual Studio 2017 and build & run from there. Note that the project file is configured to assume AVX2 support; if your system doesn't have AVX2, you will need to disable it by removing `__AVX2__` from the preprocessor defines and switching the code generation instruction set to "Streaming SIMD Extensions 2 (/arch:SSE2)" - both modifications can be made in the project properties. 23 | 24 | ## Features 25 | 26 | The engine implements a traditional physics pipeline - broadphase, narrowphase, contact pairing/caching, island splitting, solve (using sequential impulses). 27 | 28 | There is only one collision primitive - box. It should be straightforward to add support for more primitives, as long as each manifold has a small number of contact points. 29 | 30 | There is only one constraint type - contact. It should be possible to add support for more constraint types, although that might require extensive changes to some algorithms, in particular SIMD. 31 | 32 | The broadphase algorithm is single-axis sweep&prune, using a non-incremental 3-pass radix sort. Incremental algorithms tend to be expensive when a lot of updates are performed, and sweep&prune is a good fit for 2D, assuming the worlds are mostly horizontal.
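As a rough illustration, here is a minimal sketch of the pair-generation pass described above. The names (`SweepEntry`, `SweepAndPrune`) are hypothetical and the sorting is simplified to `std::sort`; the actual implementation lives in `Collider::UpdatePairsSerial`/`Collider::UpdatePairsParallel` in `src/Collider.cpp`, which sorts with a radix sort and also maintains the manifold cache:

```c++
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// Hypothetical broadphase entry, mirroring the shape of Collider::BroadphaseEntry:
// min/max on the sweep (X) axis, center/extent on the other axis.
struct SweepEntry
{
    float minx, maxx;
    float centery, extenty;
    unsigned int index;
};

// Single-axis sweep & prune: sort by minx, then each entry only scans forward
// while the next entry still begins before the current entry ends on X.
void SweepAndPrune(std::vector<SweepEntry>& entries, std::vector<std::pair<unsigned, unsigned>>& pairs)
{
    std::sort(entries.begin(), entries.end(),
        [](const SweepEntry& a, const SweepEntry& b) { return a.minx < b.minx; });

    for (size_t i = 0; i < entries.size(); ++i)
        for (size_t j = i + 1; j < entries.size() && entries[j].minx <= entries[i].maxx; ++j)
            if (std::fabs(entries[j].centery - entries[i].centery) <= entries[i].extenty + entries[j].extenty)
                pairs.push_back(std::make_pair(entries[i].index, entries[j].index));
}
```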
33 | 34 | ## Controls 35 | 36 | In the demo you can use several keys to switch between various modes and cycle between scenes: 37 | 38 | * Left/Right: move the viewport 39 | * Up/Down: scale the viewport 40 | * S: Switch to the next scene; different scenes test different configurations of bodies, stress testing various parts of the pipeline 41 | * R: Reset the current scene 42 | * I: Switch island mode (see below) 43 | * M: Switch solve mode (see below) 44 | * C: Switch the number of cores the solver uses (1, 2, 4, 8, etc. - up to the number of logical cores on the machine) 45 | * P: Pause simulation 46 | * O: Cycle between various microprofile display views 47 | 48 | ## Island splitting 49 | 50 | You can switch between different island construction modes by using the `I` key: 51 | 52 | * Single: no island splitting is performed; constraint solving is effectively single-threaded. 53 | * Multiple: objects are split into islands, and each island is solved serially. Island splitting can be expensive for complex constraint graphs. 54 | * Single Sloppy: no island splitting is performed; constraint solving is multi-threaded. Each internal solve step is still serialized, which makes sure that - barring rare race conditions - impulse propagation remains effective. 55 | * Multiple Sloppy: objects are split into islands, and constraint solving within one island is multi-threaded. Compared to Single Sloppy, this requires (potentially expensive) island splitting, but preserves mechanism integrity for small islands. 56 | 57 | ## SIMD 58 | 59 | The code uses a custom SIMD library and templated code that enable SIMD computations with both SSE2 (4-wide) and AVX2 (8-wide) from the same codebase. You can switch between different SIMD widths - 1, 4, 8 - using the `M` key. 60 | 61 | The library interface is structured to make it easy to write complex algebraic code, including conditions: 62 | 63 | ```c++ 64 | Vf dv = -bounce * (relativeVelocityX * collision_normalX + relativeVelocityY * collision_normalY); 65 | Vf depth = (point2X - point1X) * collision_normalX + (point2Y - point1Y) * collision_normalY; 66 | 67 | Vf dstVelocity = max(dv - deltaVelocity, Vf::zero()); 68 | 69 | Vf j_normalLimiter_dstVelocity = select(dstVelocity, dstVelocity - maxPenetrationVelocity, depth < deltaDepth); 70 | Vf j_normalLimiter_dstDisplacingVelocity = errorReduction * max(Vf::zero(), depth - Vf::one(2.0f) * deltaDepth); 71 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::zero(); 72 | ``` 73 | 74 | To be able to use SIMD efficiently, we split islands into groups of N independent constraints (that affect 2\*N bodies), where N is the SIMD width. The constraint data is packed into AoSoA arrays (array of structures of arrays), also known as block SoA: the block size matches the SIMD width, and each field of each vector is scalarized so that the data can be loaded and stored efficiently without a need to transpose. This structure is maintained throughout all internal iterations of the solver. 75 | 76 | ## Threading 77 | 78 | The code uses a fairly standard thread pool implementation with a single queue for items. This thread pool is used for distributing work across worker threads - the batches of work are very large, for example "one island" or "all contact points", to compensate for the overhead of the locking/notification mechanisms.
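For illustration, the sketch below shows a minimal single-queue thread pool of the kind described above. The names (`TaskQueue`, `push`) are hypothetical; the engine's actual implementation is `WorkQueue` in `src/base/WorkQueue.h`, which additionally backs the `parallelFor` primitive described next:

```c++
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Minimal single-queue thread pool: every worker pops from one mutex-protected queue.
// Coarse work items keep the locking/notification overhead negligible.
class TaskQueue
{
public:
    explicit TaskQueue(unsigned workerCount)
    {
        for (unsigned i = 0; i < workerCount; ++i)
            workers.emplace_back([this] { workerLoop(); });
    }

    ~TaskQueue()
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            done = true;
        }
        wake.notify_all();
        for (std::thread& worker : workers)
            worker.join();
    }

    void push(std::function<void()> task)
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            tasks.push(std::move(task));
        }
        wake.notify_one();
    }

private:
    void workerLoop()
    {
        for (;;)
        {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                wake.wait(lock, [this] { return done || !tasks.empty(); });
                if (tasks.empty())
                    return; // done was set and no work is left
                task = std::move(tasks.front());
                tasks.pop();
            }
            task(); // run the item outside the lock
        }
    }

    std::vector<std::thread> workers;
    std::queue<std::function<void()>> tasks;
    std::mutex mutex;
    std::condition_variable wake;
    bool done = false;
};
```

The real `WorkQueue` also exposes the worker count (`getWorkerCount`), which the solver uses to size per-worker scratch buffers such as `productivew` and the per-worker manifold buffers.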
79 | 80 | On top of that we implement a `parallelFor` primitive that uses a shared atomic variable to concurrently process a large array of items. To reduce contention for the shared atomic, the arrays are split into small groups for batched processing. The code for using parallel for looks like this: 81 | 82 | ```c++ 83 | parallelFor(queue, 0, jointEnd - jointBegin, 128, [&](int i, int) { 84 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 85 | 86 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 87 | int iP = i & (N - 1); 88 | 89 | jointP.body1Index[iP] = joint.body1Index; 90 | jointP.body2Index[iP] = joint.body2Index; 91 | jointP.contactPointIndex[iP] = joint.contactPointIndex; 92 | 93 | jointP.normalLimiter_accumulatedImpulse[iP] = joint.normalLimiter_accumulatedImpulse; 94 | jointP.frictionLimiter_accumulatedImpulse[iP] = joint.frictionLimiter_accumulatedImpulse; 95 | }); 96 | ``` 97 | 98 | A bit of care is required to find optimal group size for each for loop; additionally, for small arrays it might be worthwhile to implement a serial path - although in case of the code above, `parallelFor` automatically serializes execution if there is only one group - that is, if the number of joints (`jointEnd - jointBegin`) is less than or equal to 128. 99 | -------------------------------------------------------------------------------- /packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /phyx.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.106 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "phyx", "phyx.vcxproj", "{40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x64.ActiveCfg = Debug|x64 17 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x64.Build.0 = Debug|x64 18 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x86.ActiveCfg = Debug|Win32 19 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x86.Build.0 = Debug|Win32 20 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x64.ActiveCfg = Release|x64 21 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x64.Build.0 = Release|x64 22 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x86.ActiveCfg = Release|Win32 23 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {9762CF5D-3DD5-4853-8CF4-4B05F956CFB0} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /phyx.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | 
{40C91DE2-9442-45F0-8BB5-DC8CC4123A3E} 24 | phyx 25 | 10.0.17763.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | MultiByte 33 | 34 | 35 | Application 36 | false 37 | v141 38 | true 39 | MultiByte 40 | 41 | 42 | Application 43 | true 44 | v141 45 | MultiByte 46 | 47 | 48 | Application 49 | false 50 | v141 51 | true 52 | MultiByte 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | Level3 76 | Disabled 77 | true 78 | true 79 | src/microprofile 80 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;%(PreprocessorDefinitions) 81 | Fast 82 | AdvancedVectorExtensions2 83 | 84 | 85 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 86 | 87 | 88 | 89 | 90 | Level3 91 | Disabled 92 | true 93 | true 94 | src/microprofile 95 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;%(PreprocessorDefinitions) 96 | Fast 97 | AdvancedVectorExtensions2 98 | 99 | 100 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 101 | 102 | 103 | 104 | 105 | Level3 106 | MaxSpeed 107 | true 108 | true 109 | true 110 | src/microprofile 111 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;NDEBUG;%(PreprocessorDefinitions) 112 | Fast 113 | AdvancedVectorExtensions2 114 | false 115 | 116 | 117 | true 118 | true 119 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 120 | 121 | 122 | 123 | 124 | Level3 125 | MaxSpeed 126 | true 127 | true 128 | true 129 | src/microprofile 130 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;NDEBUG;%(PreprocessorDefinitions) 131 | Fast 132 | AdvancedVectorExtensions2 133 | false 134 | 135 | 136 | true 137 | true 138 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /phyx.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {cc5792bf-4514-426c-b977-98bb3782e085} 10 | 11 | 12 | {40ef955c-c1c2-4774-ad98-2e48043cd809} 13 | 14 | 15 | {c55d19a8-61f0-42bf-a230-4f8179c9f303} 16 | 17 | 18 | 19 | 20 | src 21 | 22 | 23 | src 24 | 25 | 26 | src 27 | 28 | 29 | src 30 | 31 | 32 | src 33 | 34 | 35 | src 36 | 37 | 38 | src 39 | 40 | 41 | src 42 | 43 | 44 | src 45 | 46 | 47 | src 48 | 49 | 50 | src 51 | 52 | 53 | src\microprofile 54 | 55 | 56 | src\microprofile 57 | 58 | 59 | src\microprofile 60 | 61 | 62 | src\microprofile 63 | 64 | 65 | src\base 66 | 67 | 68 | src\base 69 | 70 | 71 | src\base 72 | 73 | 74 | src\base 75 | 76 | 77 | src\base 78 | 79 | 80 | src\base 81 | 82 | 83 | src\base 84 | 85 | 86 | src\base 87 | 88 | 89 | src\base 90 | 91 | 92 | src\base 93 | 94 | 95 | src\glad 96 | 97 | 98 | 99 | 100 | src 101 | 102 | 103 | src 104 | 105 | 106 | src 107 | 108 | 109 | src 110 | 111 | 112 | src\base 113 | 114 | 115 | src\base 116 | 117 | 118 | src\glad 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /src/AABB2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | class AABB2 7 | { 8 | public: 9 | AABB2() 10 | { 11 | Reset(); 12 | } 13 | AABB2(const Vector2& _boxPoint1, const Vector2& _boxPoint2) 14 | { 15 | Set(_boxPoint1, _boxPoint2); 16 | } 17 | 18 | bool Intersects(const AABB2& aabb) const 19 | { 20 | if ((boxPoint1.x > aabb.boxPoint2.x) || (aabb.boxPoint1.x > boxPoint2.x)) return 0; 21 | if ((boxPoint1.y > aabb.boxPoint2.y) || (aabb.boxPoint1.y > boxPoint2.y)) return 0; 22 | return 1; 23 | } 24 | 25 | template 26 | bool Intersects(Vector2 rayStart, Vector2 rayDir, float& paramMin, float& paramMax) 27 | { 28 | // r.dir is unit direction vector of ray 29 | Vector2f invDir(1.0f / rayDir.x, 1.0f / rayDir.y); 30 | 31 | // lb is the corner of AABB with minimal coordinates - left bottom, rt is maximal corner 32 | // r.org is origin of ray 33 | float t1 = (boxPoint1.x - rayStart.x) * invDir.x; 34 | float t2 = (boxPoint2.x - rayStart.x) * invDir.x; 35 | float t3 = (boxPoint1.y - rayStart.y) * invDir.y; 36 | float t4 = (boxPoint2.y - rayStart.y) * invDir.y; 37 | 38 | paramMin = std::max(std::min(t1, t2), std::min(t3, t4)); 39 | paramMax = std::min(std::max(t1, t2), std::max(t3, t4)); 40 | 41 | // if tmax < 0, ray (line) is intersecting AABB, but whole AABB is behing us 42 | if (isFiniteCast && paramMax < 0) 43 | { 44 | return false; 45 | } 46 | 47 | // if tmin > tmax, ray doesn't intersect AABB 48 | if (paramMin > paramMax) 49 | { 50 | return false; 51 | } 52 | return true; 53 | } 54 | bool Includes(const Vector2& point) const 55 | { 56 | if ((point.x < boxPoint1.x) || (point.x > boxPoint2.x) || 57 | (point.y < boxPoint1.y) || (point.y > boxPoint2.y)) 58 | { 59 | return 0; 60 | } 61 | return 1; 62 | } 63 | bool Includes(const AABB2& aabb) const 64 | { 65 | return Includes(aabb.boxPoint1) && Includes(aabb.boxPoint2); 66 | } 67 | void Set(const Vector2& _boxPoint1, const Vector2& _boxPoint2) 68 | { 69 | boxPoint1 = _boxPoint1; 70 | boxPoint2 = _boxPoint2; 71 | } 72 | void Reset() 73 | { 74 | boxPoint1 = Vector2::zeroVector(); 75 | boxPoint2 = 
Vector2::zeroVector(); 76 | } 77 | void Expand(const Vector2& additionalPoint) 78 | { 79 | boxPoint1.x = Min(boxPoint1.x, additionalPoint.x); 80 | boxPoint1.y = Min(boxPoint1.y, additionalPoint.y); 81 | 82 | boxPoint2.x = Max(boxPoint2.x, additionalPoint.x); 83 | boxPoint2.y = Max(boxPoint2.y, additionalPoint.y); 84 | } 85 | void Expand(const AABB2& internalAABB) 86 | { 87 | Expand(internalAABB.boxPoint1); 88 | Expand(internalAABB.boxPoint2); 89 | } 90 | void Expand(const T mult) 91 | { 92 | Vector2 size = boxPoint2 - boxPoint1; 93 | boxPoint1 -= size * (mult - T(1.0)) * T(0.5); 94 | boxPoint2 += size * (mult - T(1.0)) * T(0.5); 95 | } 96 | const T Square() //perimeter actually, used for AABBTree balancing 97 | { 98 | Vector2 size = boxPoint2 - boxPoint1; 99 | return (size.x + size.y) * T(2.0); 100 | } 101 | Vector2 GetSize() 102 | { 103 | return boxPoint2 - boxPoint1; 104 | } 105 | Vector2 boxPoint1, boxPoint2; 106 | }; 107 | 108 | typedef AABB2 AABB2f; 109 | typedef AABB2 AABB2d; 110 | -------------------------------------------------------------------------------- /src/Collider.cpp: -------------------------------------------------------------------------------- 1 | #include "Collider.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "base/RadixSort.h" 5 | 6 | #include "microprofile.h" 7 | 8 | static NOINLINE bool ComputeSeparatingAxis(RigidBody* body1, RigidBody* body2, Vector2f& separatingAxis) 9 | { 10 | // http://www.geometrictools.com/Source/Intersection2D.html#PlanarPlanar 11 | // Adapted to return axis with least amount of penetration 12 | const Vector2f* A0 = &body1->coords.xVector; 13 | const Vector2f* A1 = &body2->coords.xVector; 14 | const Vector2f& E0 = body1->geom.size; 15 | const Vector2f& E1 = body2->geom.size; 16 | 17 | Vector2f D = body1->coords.pos - body2->coords.pos; 18 | 19 | float Adot[2][2]; 20 | 21 | // Test axis box0.axis[0] 22 | Adot[0][0] = fabsf(A0[0] * A1[0]); 23 | Adot[0][1] = fabsf(A0[0] * A1[1]); 24 | 25 | float rSum0 = E0.x + E1.x * Adot[0][0] + E1.y * Adot[0][1]; 26 | float dist0 = fabsf(A0[0] * D) - rSum0; 27 | if (dist0 > 0) return false; 28 | 29 | float bestdist = dist0; 30 | Vector2f bestaxis = A0[0]; 31 | 32 | // Test axis box0.axis[1]. 33 | Adot[1][0] = fabsf(A0[1] * A1[0]); 34 | Adot[1][1] = fabsf(A0[1] * A1[1]); 35 | 36 | float rSum1 = E0.y + E1.x * Adot[1][0] + E1.y * Adot[1][1]; 37 | float dist1 = fabsf(A0[1] * D) - rSum1; 38 | if (dist1 > 0) return false; 39 | if (dist1 > bestdist) { bestdist = dist1; bestaxis = A0[1]; } 40 | 41 | // Test axis box1.axis[0]. 42 | float rSum2 = E1.x + E0.x * Adot[0][0] + E0.y * Adot[1][0]; 43 | float dist2 = fabsf(A1[0] * D) - rSum2; 44 | if (dist2 > 0) return false; 45 | if (dist2 > bestdist) { bestdist = dist2; bestaxis = A1[0]; } 46 | 47 | // Test axis box1.axis[1]. 
48 | float rSum3 = E1.y + E0.x * Adot[0][1] + E0.y * Adot[1][1]; 49 | float dist3 = fabsf(A1[1] * D) - rSum3; 50 | if (dist3 > 0) return false; 51 | if (dist3 > bestdist) { bestdist = dist3; bestaxis = A1[1]; } 52 | 53 | separatingAxis = bestaxis; 54 | return true; 55 | } 56 | 57 | static void AddPoint(ContactPoint* points, int& pointCount, ContactPoint& newbie) 58 | { 59 | ContactPoint* closest = 0; 60 | float bestdepth = std::numeric_limits::max(); 61 | 62 | for (int collisionIndex = 0; collisionIndex < pointCount; collisionIndex++) 63 | { 64 | ContactPoint& col = points[collisionIndex]; 65 | 66 | if (newbie.Equals(col, 2.0f)) 67 | { 68 | float depth = (newbie.delta1 - col.delta1).SquareLen() + (newbie.delta2 - col.delta2).SquareLen(); 69 | if (depth < bestdepth) 70 | { 71 | bestdepth = depth; 72 | closest = &col; 73 | } 74 | } 75 | } 76 | 77 | if (closest) 78 | { 79 | closest->isMerged = 1; 80 | closest->isNewlyCreated = 0; 81 | closest->normal = newbie.normal; 82 | closest->delta1 = newbie.delta1; 83 | closest->delta2 = newbie.delta2; 84 | } 85 | else 86 | { 87 | assert(pointCount < 4); 88 | newbie.isMerged = 1; 89 | newbie.isNewlyCreated = 1; 90 | points[pointCount++] = newbie; 91 | } 92 | } 93 | 94 | static void NOINLINE GenerateContacts(RigidBody* body1, RigidBody* body2, ContactPoint* points, int& pointCount, Vector2f separatingAxis) 95 | { 96 | if (separatingAxis * (body1->coords.pos - body2->coords.pos) < 0.0f) 97 | separatingAxis.Invert(); 98 | 99 | const int kMaxSupportPoints = 2; 100 | Vector2f supportPoints1[kMaxSupportPoints]; 101 | Vector2f supportPoints2[kMaxSupportPoints]; 102 | 103 | float linearTolerance = 2.0f; 104 | 105 | int supportPointsCount1 = body1->geom.GetSupportPointSet(-separatingAxis, supportPoints1); 106 | int supportPointsCount2 = body2->geom.GetSupportPointSet(separatingAxis, supportPoints2); 107 | 108 | if ((supportPointsCount1 == 2) && (((supportPoints1[0] - supportPoints1[1])).SquareLen() < linearTolerance * linearTolerance)) 109 | { 110 | supportPoints1[0] = (supportPoints1[0] + supportPoints1[1]) * 0.5f; 111 | supportPointsCount1 = 1; 112 | } 113 | if ((supportPointsCount2 == 2) && (((supportPoints2[0] - supportPoints2[1])).SquareLen() < linearTolerance * linearTolerance)) 114 | { 115 | supportPoints2[0] = (supportPoints2[0] + supportPoints2[1]) * 0.5f; 116 | supportPointsCount2 = 1; 117 | } 118 | 119 | if ((supportPointsCount1 == 1) && (supportPointsCount2 == 1)) 120 | { 121 | Vector2f delta = supportPoints2[0] - supportPoints1[0]; 122 | //float eps = (delta ^ separatingAxis).SquareLen(); 123 | if (delta * separatingAxis >= 0.0f) 124 | { 125 | ContactPoint newbie(supportPoints1[0], supportPoints2[0], separatingAxis, body1, body2); 126 | AddPoint(points, pointCount, newbie); 127 | } 128 | } 129 | else if ((supportPointsCount1 == 1) && (supportPointsCount2 == 2)) 130 | { 131 | Vector2f n = (supportPoints2[1] - supportPoints2[0]).GetPerpendicular(); 132 | Vector2f point; 133 | ProjectPointToLine(supportPoints1[0], supportPoints2[0], n, separatingAxis, point); 134 | 135 | if ((((point - supportPoints2[0]) * (supportPoints2[1] - supportPoints2[0])) >= 0.0f) && 136 | (((point - supportPoints2[1]) * (supportPoints2[0] - supportPoints2[1])) >= 0.0f)) 137 | { 138 | ContactPoint newbie(supportPoints1[0], point, separatingAxis, body1, body2); 139 | AddPoint(points, pointCount, newbie); 140 | } 141 | } 142 | else if ((supportPointsCount1 == 2) && (supportPointsCount2 == 1)) 143 | { 144 | Vector2f n = (supportPoints1[1] - supportPoints1[0]).GetPerpendicular(); 
145 | Vector2f point; 146 | ProjectPointToLine(supportPoints2[0], supportPoints1[0], n, separatingAxis, point); 147 | 148 | if ((((point - supportPoints1[0]) * (supportPoints1[1] - supportPoints1[0])) >= 0.0f) && 149 | (((point - supportPoints1[1]) * (supportPoints1[0] - supportPoints1[1])) >= 0.0f)) 150 | { 151 | ContactPoint newbie(point, supportPoints2[0], separatingAxis, body1, body2); 152 | AddPoint(points, pointCount, newbie); 153 | } 154 | } 155 | else if ((supportPointsCount2 == 2) && (supportPointsCount1 == 2)) 156 | { 157 | struct TempColInfo 158 | { 159 | Vector2f point1, point2; 160 | }; 161 | TempColInfo tempCol[4]; 162 | int tempCols = 0; 163 | for (int i = 0; i < 2; i++) 164 | { 165 | Vector2f n = (supportPoints2[1] - supportPoints2[0]).GetPerpendicular(); 166 | if ((supportPoints1[i] - supportPoints2[0]) * n >= 0.0) 167 | { 168 | Vector2f point; 169 | ProjectPointToLine(supportPoints1[i], supportPoints2[0], n, separatingAxis, point); 170 | 171 | if ((((point - supportPoints2[0]) * (supportPoints2[1] - supportPoints2[0])) >= 0.0f) && 172 | (((point - supportPoints2[1]) * (supportPoints2[0] - supportPoints2[1])) >= 0.0f)) 173 | { 174 | tempCol[tempCols].point1 = supportPoints1[i]; 175 | tempCol[tempCols].point2 = point; 176 | tempCols++; 177 | } 178 | } 179 | } 180 | for (int i = 0; i < 2; i++) 181 | { 182 | Vector2f n = (supportPoints1[1] - supportPoints1[0]).GetPerpendicular(); 183 | if ((supportPoints2[i] - supportPoints1[0]) * n >= 0.0) 184 | { 185 | Vector2f point; 186 | ProjectPointToLine(supportPoints2[i], supportPoints1[0], n, separatingAxis, point); 187 | 188 | if ((((point - supportPoints1[0]) * (supportPoints1[1] - supportPoints1[0])) >= 0.0f) && 189 | (((point - supportPoints1[1]) * (supportPoints1[0] - supportPoints1[1])) >= 0.0f)) 190 | { 191 | tempCol[tempCols].point1 = point; 192 | tempCol[tempCols].point2 = supportPoints2[i]; 193 | tempCols++; 194 | } 195 | } 196 | } 197 | 198 | if (tempCols == 1) //buggy but must work 199 | { 200 | ContactPoint newbie(tempCol[0].point1, tempCol[0].point2, separatingAxis, body1, body2); 201 | AddPoint(points, pointCount, newbie); 202 | } 203 | if (tempCols >= 2) //means only equality, but clamp to two points 204 | { 205 | ContactPoint newbie1(tempCol[0].point1, tempCol[0].point2, separatingAxis, body1, body2); 206 | AddPoint(points, pointCount, newbie1); 207 | ContactPoint newbie2(tempCol[1].point1, tempCol[1].point2, separatingAxis, body1, body2); 208 | AddPoint(points, pointCount, newbie2); 209 | } 210 | } 211 | } 212 | 213 | static void UpdateManifold(Manifold& m, RigidBody* bodies, ContactPoint* points) 214 | { 215 | ContactPoint newpoints[kMaxContactPoints * 2]; 216 | 217 | for (int collisionIndex = 0; collisionIndex < m.pointCount; collisionIndex++) 218 | { 219 | newpoints[collisionIndex] = points[collisionIndex]; 220 | newpoints[collisionIndex].isMerged = 0; 221 | newpoints[collisionIndex].isNewlyCreated = 0; 222 | } 223 | 224 | int newPointCount = m.pointCount; 225 | 226 | RigidBody* body1 = &bodies[m.body1Index]; 227 | RigidBody* body2 = &bodies[m.body2Index]; 228 | 229 | Vector2f separatingAxis; 230 | if (ComputeSeparatingAxis(body1, body2, separatingAxis)) 231 | { 232 | GenerateContacts(body1, body2, newpoints, newPointCount, separatingAxis); 233 | } 234 | 235 | m.pointCount = 0; 236 | 237 | for (int collisionIndex = 0; collisionIndex < newPointCount; ++collisionIndex) 238 | { 239 | if (newpoints[collisionIndex].isMerged) 240 | { 241 | assert(m.pointCount < kMaxContactPoints); 242 | points[m.pointCount++] = 
newpoints[collisionIndex]; 243 | } 244 | } 245 | } 246 | 247 | Collider::Collider() 248 | { 249 | } 250 | 251 | NOINLINE void Collider::UpdateBroadphase(RigidBody* bodies, size_t bodiesCount) 252 | { 253 | MICROPROFILE_SCOPEI("Physics", "UpdateBroadphase", -1); 254 | 255 | broadphase.resize(bodiesCount); 256 | broadphaseSort[0].resize(bodiesCount); 257 | broadphaseSort[1].resize(bodiesCount); 258 | 259 | for (size_t bodyIndex = 0; bodyIndex < bodiesCount; ++bodyIndex) 260 | { 261 | const AABB2f& aabb = bodies[bodyIndex].geom.aabb; 262 | 263 | broadphaseSort[0][bodyIndex].value = radixFloat(aabb.boxPoint1.x); 264 | broadphaseSort[0][bodyIndex].index = bodyIndex; 265 | } 266 | 267 | radixSort3(broadphaseSort[0].data, broadphaseSort[1].data, bodiesCount, [](const BroadphaseSortEntry& e) { return e.value; }); 268 | 269 | for (size_t i = 0; i < bodiesCount; ++i) 270 | { 271 | unsigned int bodyIndex = broadphaseSort[1][i].index; 272 | 273 | const AABB2f& aabb = bodies[bodyIndex].geom.aabb; 274 | 275 | BroadphaseEntry e = 276 | { 277 | aabb.boxPoint1.x, aabb.boxPoint2.x, 278 | (aabb.boxPoint1.y + aabb.boxPoint2.y) * 0.5f, 279 | (aabb.boxPoint2.y - aabb.boxPoint1.y) * 0.5f, 280 | unsigned(bodyIndex)}; 281 | 282 | broadphase[i] = e; 283 | } 284 | } 285 | 286 | NOINLINE void Collider::UpdatePairs(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount) 287 | { 288 | assert(bodiesCount == broadphase.size); 289 | 290 | if (queue.getWorkerCount() == 0) 291 | UpdatePairsSerial(bodies, bodiesCount); 292 | else 293 | UpdatePairsParallel(queue, bodies, bodiesCount); 294 | } 295 | 296 | NOINLINE void Collider::UpdatePairsSerial(RigidBody* bodies, size_t bodiesCount) 297 | { 298 | MICROPROFILE_SCOPEI("Physics", "UpdatePairsSerial", -1); 299 | 300 | for (size_t bodyIndex1 = 0; bodyIndex1 < bodiesCount; bodyIndex1++) 301 | { 302 | const BroadphaseEntry& be1 = broadphase[bodyIndex1]; 303 | float maxx = be1.maxx; 304 | 305 | for (size_t bodyIndex2 = bodyIndex1 + 1; bodyIndex2 < bodiesCount; bodyIndex2++) 306 | { 307 | const BroadphaseEntry& be2 = broadphase[bodyIndex2]; 308 | if (be2.minx > maxx) 309 | break; 310 | 311 | if (fabsf(be2.centery - be1.centery) <= be1.extenty + be2.extenty) 312 | { 313 | if (manifoldMap.insert(std::make_pair(be1.index, be2.index))) 314 | { 315 | manifolds.push_back(Manifold(be1.index, be2.index, manifolds.size * kMaxContactPoints)); 316 | } 317 | } 318 | } 319 | } 320 | } 321 | 322 | NOINLINE void Collider::UpdatePairsParallel(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount) 323 | { 324 | MICROPROFILE_SCOPEI("Physics", "UpdatePairsParallel", -1); 325 | 326 | manifoldBuffers.resize(queue.getWorkerCount() + 1); 327 | 328 | for (auto& buf: manifoldBuffers) 329 | buf.pairs.clear(); 330 | 331 | parallelFor(queue, 0, bodiesCount, 128, [this, bodies, bodiesCount](int bodyIndex1, int worker) { 332 | UpdatePairsOne(bodies, bodyIndex1, bodyIndex1 + 1, bodiesCount, manifoldBuffers[worker]); 333 | }); 334 | 335 | MICROPROFILE_SCOPEI("Physics", "CreateManifolds", -1); 336 | 337 | for (auto& buf : manifoldBuffers) 338 | { 339 | for (auto& pair : buf.pairs) 340 | { 341 | manifoldMap.insert(pair); 342 | manifolds.push_back(Manifold(pair.first, pair.second, manifolds.size * kMaxContactPoints)); 343 | } 344 | } 345 | } 346 | 347 | void Collider::UpdatePairsOne(RigidBody* bodies, size_t bodyIndex1, size_t startIndex, size_t endIndex, ManifoldDeferredBuffer& buffer) 348 | { 349 | const BroadphaseEntry& be1 = broadphase[bodyIndex1]; 350 | float maxx = be1.maxx; 351 | 352 | for (size_t bodyIndex2 
= startIndex; bodyIndex2 < endIndex; bodyIndex2++) 353 | { 354 | const BroadphaseEntry& be2 = broadphase[bodyIndex2]; 355 | if (be2.minx > maxx) 356 | return; 357 | 358 | if (fabsf(be2.centery - be1.centery) <= be1.extenty + be2.extenty) 359 | { 360 | if (!manifoldMap.contains(std::make_pair(be1.index, be2.index))) 361 | { 362 | buffer.pairs.push_back(std::make_pair(be1.index, be2.index)); 363 | } 364 | } 365 | } 366 | } 367 | 368 | NOINLINE void Collider::UpdateManifolds(WorkQueue& queue, RigidBody* bodies) 369 | { 370 | MICROPROFILE_SCOPEI("Physics", "UpdateManifolds", -1); 371 | 372 | contactPoints.resize_copy(manifolds.size * kMaxContactPoints); 373 | 374 | parallelFor(queue, manifolds.data, manifolds.size, 16, [&](Manifold& m, int) { 375 | UpdateManifold(m, bodies, contactPoints.data + m.pointIndex); 376 | }); 377 | } 378 | 379 | NOINLINE void Collider::PackManifolds(RigidBody* bodies) 380 | { 381 | MICROPROFILE_SCOPEI("Physics", "PackManifolds", -1); 382 | 383 | for (int manifoldIndex = 0; manifoldIndex < manifolds.size;) 384 | { 385 | Manifold& m = manifolds[manifoldIndex]; 386 | 387 | // TODO 388 | // This reduces broadphase insert/erase operations, which is good 389 | // However, current behavior causes issues with DenseHash - is it possible to improve it? 390 | if (m.pointCount == 0 && !bodies[m.body1Index].geom.aabb.Intersects(bodies[m.body2Index].geom.aabb)) 391 | { 392 | manifoldMap.erase(std::make_pair(m.body1Index, m.body2Index)); 393 | 394 | if (manifoldIndex < manifolds.size) 395 | { 396 | Manifold& me = manifolds[manifolds.size - 1]; 397 | 398 | unsigned int pointIndex = m.pointIndex; 399 | 400 | for (int i = 0; i < me.pointCount; ++i) 401 | contactPoints[pointIndex + i] = contactPoints[me.pointIndex + i]; 402 | 403 | m = me; 404 | m.pointIndex = pointIndex; 405 | } 406 | 407 | manifolds.size--; 408 | } 409 | else 410 | { 411 | manifoldIndex++; 412 | } 413 | } 414 | 415 | contactPoints.truncate(manifolds.size * kMaxContactPoints); 416 | } -------------------------------------------------------------------------------- /src/Collider.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Manifold.h" 4 | #include "base/DenseHash.h" 5 | #include "base/AlignedArray.h" 6 | 7 | namespace std 8 | { 9 | template <> 10 | struct hash> 11 | { 12 | size_t operator()(const std::pair& p) const 13 | { 14 | unsigned int lb = p.first; 15 | unsigned int rb = p.second; 16 | 17 | return lb ^ (rb + 0x9e3779b9 + (lb << 6) + (lb >> 2)); 18 | } 19 | }; 20 | } 21 | 22 | class WorkQueue; 23 | 24 | struct Collider 25 | { 26 | Collider(); 27 | 28 | void UpdateBroadphase(RigidBody* bodies, size_t bodiesCount); 29 | void UpdatePairs(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount); 30 | void UpdatePairsSerial(RigidBody* bodies, size_t bodiesCount); 31 | void UpdatePairsParallel(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount); 32 | 33 | struct ManifoldDeferredBuffer; 34 | 35 | void UpdatePairsOne(RigidBody* bodies, size_t bodyIndex1, size_t startIndex, size_t endIndex, ManifoldDeferredBuffer& buffer); 36 | 37 | void UpdateManifolds(WorkQueue& queue, RigidBody* bodies); 38 | void PackManifolds(RigidBody* bodies); 39 | 40 | struct ManifoldDeferredBuffer 41 | { 42 | AlignedArray> pairs; 43 | }; 44 | 45 | struct BroadphaseEntry 46 | { 47 | float minx, maxx; 48 | float centery, extenty; 49 | unsigned int index; 50 | }; 51 | 52 | struct BroadphaseSortEntry 53 | { 54 | unsigned int value; 55 | unsigned int index; 56 | }; 57 | 58 | 
DenseHashSet> manifoldMap; 59 | 60 | AlignedArray manifolds; 61 | AlignedArray contactPoints; 62 | 63 | std::vector manifoldBuffers; 64 | 65 | AlignedArray broadphase; 66 | AlignedArray broadphaseSort[2]; 67 | }; -------------------------------------------------------------------------------- /src/Configuration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | struct Configuration 4 | { 5 | enum SolveMode 6 | { 7 | Solve_Scalar, 8 | Solve_SSE2, 9 | Solve_AVX2, 10 | }; 11 | 12 | enum IslandMode 13 | { 14 | Island_Single, 15 | Island_Multiple, 16 | Island_SingleSloppy, 17 | Island_MultipleSloppy 18 | }; 19 | 20 | SolveMode solveMode; 21 | IslandMode islandMode; 22 | int contactIterationsCount; 23 | int penetrationIterationsCount; 24 | }; -------------------------------------------------------------------------------- /src/Coords2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | class Coords2 5 | { 6 | public: 7 | Vector2 xVector, yVector; 8 | Vector2 pos; 9 | Coords2() {} 10 | Coords2(const Vector2& _pos, const T angle) 11 | { 12 | float pi = 3.141592f; 13 | xVector = Vector2(cos(angle), sin(angle)); 14 | yVector = Vector2(cos(angle + T(pi) / T(2.0)), sin(angle + T(pi) / T(2.0))); 15 | 16 | pos = _pos; 17 | } 18 | 19 | const Vector2 GetPointRelativePos(const Vector2& globalPoint) const 20 | { 21 | Vector2 delta = globalPoint - pos; 22 | return Vector2(delta * xVector, 23 | delta * yVector); 24 | } 25 | 26 | const Vector2 GetAxisRelativeOrientation(const Vector2& globalAxis) const 27 | { 28 | Vector2 delta = globalAxis; 29 | return Vector2(delta * xVector, 30 | delta * yVector); 31 | } 32 | 33 | const Vector2 GetPointGlobalPos(const Vector2& relativePoint) const 34 | { 35 | return pos + xVector * relativePoint.x + yVector * relativePoint.y; 36 | } 37 | 38 | const Vector2 GetAxisGlobalOrientation(const Vector2& relativeAxis) const 39 | { 40 | return xVector * relativeAxis.x + yVector * relativeAxis.y; 41 | } 42 | 43 | const Coords2 GetGlobalCoords(const Coords2& localCoords) 44 | { 45 | Coords2 res; 46 | res.pos = GetPointGlobalPos(localCoords.pos); 47 | res.xVector = GetAxisGlobalOrientation(localCoords.xVector); 48 | res.yVector = GetAxisGlobalOrientation(localCoords.yVector); 49 | return res; 50 | } 51 | 52 | const Coords2 GetLocalCoords(const Coords2& globalCoords) 53 | { 54 | Coords2 res; 55 | res.pos = GetPointRelativePos(globalCoords.pos); 56 | res.xVector = GetAxisRelativeOrientation(globalCoords.xVector); 57 | res.yVector = GetAxisRelativeOrientation(globalCoords.yVector); 58 | return res; 59 | } 60 | 61 | void Identity() 62 | { 63 | xVector = Vector2(1.0f, 0.0f); 64 | yVector = Vector2(0.0f, 1.0f); 65 | pos = Vector2::zeroVector(); 66 | } 67 | 68 | void SetRotation(const T& angle) 69 | { 70 | float pi = 3.141592f; 71 | xVector = Vector2(cos(angle), sin(angle)); 72 | yVector = Vector2(cos(angle + pi / 2.0), sin(angle + pi / 2.0)); 73 | } 74 | 75 | void Rotate(const T& angle) 76 | { 77 | this->xVector.Rotate(angle); 78 | this->yVector.Rotate(angle); 79 | } 80 | 81 | static const Coords2 defCoords() 82 | { 83 | Coords2 coords; 84 | coords.pos = Vector2::zeroVector(); 85 | coords.xVector = Vector2::xAxis(); 86 | coords.yVector = Vector2::yAxis(); 87 | return coords; 88 | } 89 | }; 90 | 91 | typedef Coords2 Coords2f; 92 | typedef Coords2 Coords2d; 93 | 94 | typedef Coords2f Coords2f; 95 | typedef Coords2d Coords2d; 96 | 
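// Illustrative note (not part of the original file): a Coords2 is a local frame - an origin plus a
// rotated basis (xVector/yVector). GetPointGlobalPos maps local coordinates into world space and
// GetPointRelativePos is its inverse; e.g. for Coords2f(Vector2f(10.0f, 0.0f), 3.141592f * 0.5f),
// GetPointGlobalPos(Vector2f(1.0f, 0.0f)) yields approximately (10, 1).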
-------------------------------------------------------------------------------- /src/Geom.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Vector2.h" 3 | #include "Coords2.h" 4 | #include "AABB2.h" 5 | #include 6 | 7 | struct RigidBody; 8 | struct Geom 9 | { 10 | Vector2f GetClippingVertex(const Vector2f& axis) const 11 | { 12 | Vector2f xdim = coords.xVector * size.x; 13 | Vector2f ydim = coords.yVector * size.y; 14 | 15 | float xsgn = coords.xVector * axis < 0.0f ? -1.0f : 1.0f; 16 | float ysgn = coords.yVector * axis < 0.0f ? -1.0f : 1.0f; 17 | 18 | return coords.pos + xsgn * xdim + ysgn * ydim; 19 | } 20 | 21 | bool GetClippingEdge(const Vector2f& axis, Vector2f& edgepoint1, Vector2f& edgepoint2) const 22 | { 23 | edgepoint1 = coords.pos; 24 | edgepoint2 = coords.pos; 25 | Vector2f xdim = coords.xVector * size.x; 26 | Vector2f ydim = coords.yVector * size.y; 27 | Vector2f offset = Vector2f::zero(); 28 | float xdiff = axis * coords.xVector; 29 | float ydiff = axis * coords.yVector; 30 | 31 | if (fabsf(xdiff) < fabsf(ydiff)) 32 | { 33 | if (axis * ydim > 0.0f) 34 | { 35 | offset += ydim; 36 | edgepoint1 += xdim; 37 | edgepoint2 -= xdim; 38 | } 39 | else 40 | { 41 | offset -= ydim; 42 | edgepoint1 -= xdim; 43 | edgepoint2 += xdim; 44 | } 45 | } 46 | else 47 | { 48 | if (axis * xdim > 0.0f) 49 | { 50 | offset += xdim; 51 | edgepoint1 -= ydim; 52 | edgepoint2 += ydim; 53 | } 54 | else 55 | { 56 | offset -= xdim; 57 | edgepoint1 += ydim; 58 | edgepoint2 -= ydim; 59 | } 60 | } 61 | edgepoint1 += offset; 62 | edgepoint2 += offset; 63 | return 1; 64 | } 65 | 66 | int GetSupportPointSet(const Vector2f& axis, Vector2f* supportPoints) 67 | { 68 | if ((fabsf(axis * coords.xVector) < 0.1f) || 69 | (fabsf(axis * coords.yVector) < 0.1f)) 70 | { 71 | GetClippingEdge(axis, supportPoints[0], supportPoints[1]); 72 | return 2; 73 | } 74 | 75 | supportPoints[0] = GetClippingVertex(axis); 76 | return 1; 77 | } 78 | 79 | void RecomputeAABB() 80 | { 81 | Vector2f diff = Vector2f( 82 | fabsf(coords.xVector.x) * size.x + fabsf(coords.yVector.x) * size.y, 83 | fabsf(coords.xVector.y) * size.x + fabsf(coords.yVector.y) * size.y); 84 | aabb.Set(coords.pos - diff, coords.pos + diff); 85 | } 86 | 87 | Vector2f size; 88 | Coords2f coords; 89 | AABB2f aabb; 90 | }; -------------------------------------------------------------------------------- /src/Joints.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "RigidBody.h" 4 | #include "Manifold.h" 5 | 6 | struct ContactJoint 7 | { 8 | ContactJoint(int body1Index, int body2Index, int collisionIndex) 9 | { 10 | this->contactPointIndex = collisionIndex; 11 | this->body1Index = body1Index; 12 | this->body2Index = body2Index; 13 | 14 | normalLimiter_accumulatedImpulse = 0.f; 15 | frictionLimiter_accumulatedImpulse = 0.f; 16 | } 17 | 18 | int contactPointIndex; 19 | int body1Index; 20 | int body2Index; 21 | float normalLimiter_accumulatedImpulse; 22 | float frictionLimiter_accumulatedImpulse; 23 | }; -------------------------------------------------------------------------------- /src/Manifold.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Vector2.h" 4 | #include "Geom.h" 5 | #include "RigidBody.h" 6 | 7 | #include 8 | #include 9 | 10 | static const int kMaxContactPoints = 2; 11 | 12 | struct ContactPoint 13 | { 14 | ContactPoint() 15 | { 16 | } 17 | 18 | 
ContactPoint(Vector2f point1, const Vector2f& point2, const Vector2f normal, RigidBody* body1, RigidBody* body2) 19 | { 20 | this->delta1 = point1 - body1->coords.pos; 21 | this->delta2 = point2 - body2->coords.pos; 22 | this->normal = normal; 23 | isMerged = 0; 24 | isNewlyCreated = 1; 25 | solverIndex = -1; 26 | } 27 | 28 | bool Equals(const ContactPoint& other, float tolerance) const 29 | { 30 | if (((other.delta1 - delta1).SquareLen() > tolerance * tolerance) && 31 | ((other.delta2 - delta2).SquareLen() > tolerance * tolerance)) 32 | { 33 | return 0; 34 | } 35 | return 1; 36 | } 37 | 38 | Vector2f delta1, delta2; 39 | Vector2f normal; 40 | bool isMerged; 41 | bool isNewlyCreated; 42 | int solverIndex; 43 | }; 44 | 45 | struct Manifold 46 | { 47 | Manifold() 48 | { 49 | body1Index = -1; 50 | body2Index = -1; 51 | pointCount = 0; 52 | pointIndex = 0; 53 | } 54 | 55 | Manifold(int body1Index, int body2Index, int pointIndex) 56 | { 57 | this->body1Index = body1Index; 58 | this->body2Index = body2Index; 59 | this->pointCount = 0; 60 | this->pointIndex = pointIndex; 61 | } 62 | 63 | int body1Index; 64 | int body2Index; 65 | 66 | int pointCount; 67 | int pointIndex; 68 | }; -------------------------------------------------------------------------------- /src/RigidBody.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Vector2.h" 3 | #include "Coords2.h" 4 | #include "Geom.h" 5 | 6 | #ifdef _MSC_VER 7 | #define NOINLINE __declspec(noinline) 8 | #else 9 | #define NOINLINE __attribute__((noinline)) 10 | #endif 11 | 12 | struct RigidBody 13 | { 14 | RigidBody() {} 15 | RigidBody(Coords2f coords, Vector2f size, float density) 16 | { 17 | this->coords = coords; 18 | displacingVelocity = Vector2f(0.0f, 0.0f); 19 | displacingAngularVelocity = 0.0f; 20 | 21 | acceleration = Vector2f(0.0f, 0.0f); 22 | angularAcceleration = 0.0f; 23 | 24 | velocity = Vector2f(0.0f, 0.0f); 25 | angularVelocity = 0.0f; 26 | 27 | geom.size = size; 28 | 29 | float mass = density * (size.x * size.y); 30 | float inertia = mass * (size.x * size.x + size.y * size.y); 31 | 32 | invMass = 1.0f / mass; 33 | invInertia = 1.0f / inertia; 34 | 35 | UpdateGeom(); 36 | } 37 | 38 | void UpdateGeom() 39 | { 40 | geom.coords = coords; 41 | geom.RecomputeAABB(); 42 | } 43 | 44 | unsigned int index; 45 | 46 | Geom geom; 47 | 48 | Vector2f velocity, acceleration; 49 | Vector2f displacingVelocity; 50 | float angularVelocity, angularAcceleration; 51 | float displacingAngularVelocity; 52 | 53 | float invMass, invInertia; 54 | Coords2f coords; 55 | 56 | int lastIteration; 57 | int lastDisplacementIteration; 58 | }; -------------------------------------------------------------------------------- /src/Solver.cpp: -------------------------------------------------------------------------------- 1 | #include "Solver.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "base/SIMD.h" 5 | 6 | #include "Configuration.h" 7 | 8 | const float kProductiveImpulse = 1e-4f; 9 | const float kFrictionCoefficient = 0.3f; 10 | 11 | const int kIslandMinSize = 256; 12 | 13 | Solver::Solver() 14 | { 15 | } 16 | 17 | NOINLINE void Solver::SolveJoints(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 18 | { 19 | switch (configuration.solveMode) 20 | { 21 | case Configuration::Solve_AVX2: 22 | #ifdef __AVX2__ 23 | SolveJoints_AVX2(queue, bodies, bodiesCount, contactPoints, configuration); 24 | break; 25 | #endif 26 | 27 | case 
Configuration::Solve_SSE2: 28 | #ifdef __SSE2__ 29 | SolveJoints_SSE2(queue, bodies, bodiesCount, contactPoints, configuration); 30 | break; 31 | #endif 32 | 33 | case Configuration::Solve_Scalar: 34 | SolveJoints_Scalar(queue, bodies, bodiesCount, contactPoints, configuration); 35 | break; 36 | 37 | default: 38 | assert(!"Unknown solver mode"); 39 | break; 40 | } 41 | } 42 | 43 | NOINLINE void Solver::SolveJoints_Scalar(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 44 | { 45 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_Scalar", -1); 46 | 47 | SolveJoints(queue, joint_packed1, bodies, bodiesCount, contactPoints, configuration); 48 | } 49 | 50 | #ifdef __SSE2__ 51 | NOINLINE void Solver::SolveJoints_SSE2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 52 | { 53 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_SSE2", -1); 54 | 55 | SolveJoints(queue, joint_packed4, bodies, bodiesCount, contactPoints, configuration); 56 | } 57 | #endif 58 | 59 | #ifdef __AVX2__ 60 | NOINLINE void Solver::SolveJoints_AVX2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 61 | { 62 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_AVX2", -1); 63 | 64 | SolveJoints(queue, joint_packed8, bodies, bodiesCount, contactPoints, configuration); 65 | } 66 | #endif 67 | 68 | template 69 | void Solver::SolveJoints(WorkQueue& queue, AlignedArray>& joint_packed, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 70 | { 71 | PrepareBodies(bodies, bodiesCount); 72 | 73 | bool splitIslands = (configuration.islandMode == Configuration::Island_Multiple || configuration.islandMode == Configuration::Island_MultipleSloppy); 74 | 75 | if (splitIslands) 76 | { 77 | int jointCountAligned = GatherIslands(bodies, bodiesCount, N); 78 | 79 | joint_packed.resize(jointCountAligned); 80 | jointGroup_joints.resize(jointCountAligned); 81 | jointGroup_bodies.resize(bodiesCount); 82 | 83 | for (int i = 0; i < bodiesCount; ++i) 84 | jointGroup_bodies[i] = 0; 85 | 86 | parallelFor(queue, 0, islandCount, 1, [&](int islandIndex, int) { 87 | int jointsBegin = island_offset[islandIndex]; 88 | int jointsEnd = jointsBegin + island_size[islandIndex]; 89 | 90 | SolveJointIsland(queue, joint_packed, jointsBegin, jointsEnd, contactPoints, configuration); 91 | }); 92 | } 93 | else 94 | { 95 | int jointCount = contactJoints.size; 96 | 97 | joint_index.resize(jointCount); 98 | joint_packed.resize(jointCount); 99 | jointGroup_joints.resize(jointCount); 100 | jointGroup_bodies.resize(bodiesCount); 101 | 102 | for (int i = 0; i < jointCount; ++i) 103 | joint_index[i] = i; 104 | 105 | for (int i = 0; i < bodiesCount; ++i) 106 | jointGroup_bodies[i] = 0; 107 | 108 | islandCount = 1; 109 | islandMaxSize = jointCount; 110 | 111 | SolveJointIsland(queue, joint_packed, 0, jointCount, contactPoints, configuration); 112 | } 113 | 114 | FinishBodies(bodies, bodiesCount); 115 | 116 | MICROPROFILE_COUNTER_SET("physics/islands", islandCount); 117 | MICROPROFILE_COUNTER_SET("physics/bodies", bodiesCount); 118 | MICROPROFILE_COUNTER_SET("physics/joints", contactJoints.size); 119 | } 120 | 121 | static bool any(const AlignedArray& v) 122 | { 123 | for (int i = 0; i < v.size; ++i) 124 | if (v[i]) 125 | return true; 126 | 127 | return false; 128 | } 129 | 130 | template 131 | NOINLINE void Solver::SolveJointIsland(WorkQueue& 
queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints, const Configuration& configuration) 132 | { 133 | MICROPROFILE_SCOPEI("Physics", "SolveJointIsland", -1); 134 | 135 | int groupOffset = PrepareJoints(queue, joint_packed, jointBegin, jointEnd, N); 136 | 137 | bool sloppy = (configuration.islandMode == Configuration::Island_SingleSloppy || configuration.islandMode == Configuration::Island_MultipleSloppy); 138 | int batchSize = sloppy ? 512 : jointEnd - jointBegin; 139 | int batchCount = ((jointEnd - jointBegin) + batchSize - 1) / batchSize; 140 | 141 | { 142 | MICROPROFILE_SCOPEI("Physics", "Prepare", -1); 143 | 144 | { 145 | MICROPROFILE_SCOPEI("Physics", "RefreshJoints", -1); 146 | 147 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 148 | int batchBegin = jointBegin + batchIndex * batchSize; 149 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 150 | 151 | RefreshJoints(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), contactPoints); 152 | RefreshJoints<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, contactPoints); 153 | }); 154 | } 155 | 156 | { 157 | MICROPROFILE_SCOPEI("Physics", "PreStepJoints", -1); 158 | 159 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 160 | int batchBegin = jointBegin + batchIndex * batchSize; 161 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 162 | 163 | PreStepJoints(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd)); 164 | PreStepJoints<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd); 165 | }); 166 | } 167 | } 168 | 169 | AlignedArray productivew; 170 | productivew.resize(queue.getWorkerCount() + 1); 171 | 172 | { 173 | MICROPROFILE_SCOPEI("Physics", "Impulse", -1); 174 | 175 | for (int iterationIndex = 0; iterationIndex < configuration.contactIterationsCount; iterationIndex++) 176 | { 177 | MICROPROFILE_SCOPEI("Physics", "ImpulseIteration", -1); 178 | 179 | memset(productivew.data, 0, productivew.size * sizeof(bool)); 180 | 181 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 182 | int batchBegin = jointBegin + batchIndex * batchSize; 183 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 184 | 185 | productivew[worker] |= SolveJointsImpulses(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), iterationIndex); 186 | productivew[worker] |= SolveJointsImpulses<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, iterationIndex); 187 | }); 188 | 189 | if (!any(productivew)) break; 190 | } 191 | } 192 | 193 | { 194 | MICROPROFILE_SCOPEI("Physics", "Displacement", -1); 195 | 196 | for (int iterationIndex = 0; iterationIndex < configuration.penetrationIterationsCount; iterationIndex++) 197 | { 198 | MICROPROFILE_SCOPEI("Physics", "DisplacementIteration", -1); 199 | 200 | memset(productivew.data, 0, productivew.size * sizeof(bool)); 201 | 202 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 203 | int batchBegin = jointBegin + batchIndex * batchSize; 204 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 205 | 206 | productivew[worker] |= SolveJointsDisplacement(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), iterationIndex); 207 | productivew[worker] |= SolveJointsDisplacement<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, iterationIndex); 208 | }); 209 | 210 | if (!any(productivew)) break; 211 | } 212 | } 213 | 214 | FinishJoints(queue, joint_packed, 
jointBegin, jointEnd); 215 | } 216 | 217 | NOINLINE int Solver::PrepareIndices(int jointBegin, int jointEnd, int groupSizeTarget) 218 | { 219 | MICROPROFILE_SCOPEI("Physics", "PrepareIndices", -1); 220 | 221 | if (groupSizeTarget == 1) 222 | return jointEnd; 223 | 224 | for (int i = jointBegin; i < jointEnd; ++i) 225 | jointGroup_joints[i] = joint_index[i]; 226 | 227 | int tag = 0; 228 | 229 | int remainingJoints = jointEnd - jointBegin; 230 | int groupOffset = jointBegin; 231 | 232 | while (remainingJoints >= groupSizeTarget) 233 | { 234 | // gather a group of N joints with non-overlapping bodies 235 | int groupSize = 0; 236 | 237 | tag++; 238 | 239 | for (int i = 0; i < remainingJoints && groupSize < groupSizeTarget;) 240 | { 241 | int jointIndex = jointGroup_joints[jointBegin + i]; 242 | ContactJoint& joint = contactJoints[jointIndex]; 243 | 244 | // TODO: race between static bodies from different islands 245 | if (jointGroup_bodies[joint.body1Index] < tag && jointGroup_bodies[joint.body2Index] < tag) 246 | { 247 | jointGroup_bodies[joint.body1Index] = tag; 248 | jointGroup_bodies[joint.body2Index] = tag; 249 | 250 | joint_index[groupOffset + groupSize] = jointIndex; 251 | groupSize++; 252 | 253 | jointGroup_joints[jointBegin + i] = jointGroup_joints[jointBegin + remainingJoints - 1]; 254 | remainingJoints--; 255 | } 256 | else 257 | { 258 | i++; 259 | } 260 | } 261 | 262 | groupOffset += groupSize; 263 | 264 | if (groupSize < groupSizeTarget) 265 | break; 266 | } 267 | 268 | // fill in the rest of the joints sequentially - they don't form a group so we'll have to solve them 1 by 1 269 | for (int i = 0; i < remainingJoints; ++i) 270 | joint_index[groupOffset + i] = jointGroup_joints[jointBegin + i]; 271 | 272 | return groupOffset & ~(groupSizeTarget - 1); 273 | } 274 | 275 | static int remap(AlignedArray& table, int index) 276 | { 277 | int result = index; 278 | 279 | while (result != table[result]) 280 | result = table[result]; 281 | 282 | return table[index] = result; 283 | } 284 | 285 | NOINLINE int Solver::GatherIslands(RigidBody* bodies, int bodiesCount, int groupSizeTarget) 286 | { 287 | MICROPROFILE_SCOPEI("Physics", "GatherIslands", -1); 288 | 289 | int jointCount = contactJoints.size; 290 | int jointCountAligned = jointCount; 291 | 292 | island_remap.resize(bodiesCount); 293 | island_index.resize(bodiesCount); 294 | island_indexremap.resize(bodiesCount); 295 | island_offset.resize(bodiesCount); 296 | island_offsettemp.resize(bodiesCount); 297 | island_size.resize(bodiesCount); 298 | 299 | { 300 | MICROPROFILE_SCOPEI("Physics", "Prepare", -1); 301 | 302 | for (int i = 0; i < bodiesCount; ++i) 303 | { 304 | island_remap[i] = (bodies[i].invMass == 0 && bodies[i].invInertia == 0) ? 
-1 : i; 305 | } 306 | } 307 | 308 | { 309 | MICROPROFILE_SCOPEI("Physics", "Merge", -1); 310 | 311 | for (ContactJoint& j: contactJoints) 312 | { 313 | int island1 = island_remap[j.body1Index]; 314 | int island2 = island_remap[j.body2Index]; 315 | 316 | if ((island1 | island2) < 0) 317 | continue; 318 | 319 | int remap1 = remap(island_remap, island1); 320 | int remap2 = remap(island_remap, island2); 321 | 322 | island_remap[remap1] = remap2; 323 | } 324 | } 325 | 326 | { 327 | MICROPROFILE_SCOPEI("Physics", "Gather", -1); 328 | 329 | islandCount = 0; 330 | 331 | for (int i = 0; i < bodiesCount; ++i) 332 | { 333 | island_index[i] = -1; 334 | } 335 | 336 | for (int i = 0; i < bodiesCount; ++i) 337 | { 338 | if (island_remap[i] < 0) 339 | continue; 340 | 341 | island_remap[i] = remap(island_remap, i); 342 | } 343 | 344 | for (int i = 0; i < bodiesCount; ++i) 345 | { 346 | if (island_remap[i] < 0) 347 | continue; 348 | 349 | int island = island_remap[i]; 350 | 351 | if (island_index[island] < 0) 352 | { 353 | island_index[island] = islandCount; 354 | islandCount++; 355 | } 356 | } 357 | } 358 | 359 | { 360 | MICROPROFILE_SCOPEI("Physics", "Count", -1); 361 | 362 | for (int i = 0; i < islandCount; ++i) 363 | { 364 | island_offset[i] = 0; 365 | } 366 | 367 | for (ContactJoint& j: contactJoints) 368 | { 369 | int island1 = island_remap[j.body1Index]; 370 | int island2 = island_remap[j.body2Index]; 371 | 372 | if ((island1 & island2) < 0) 373 | continue; 374 | 375 | assert(island1 == island2 || ((island1 | island2) < 0 && (island1 & island2) >= 0)); 376 | int island = island1 < 0 ? island2 : island1; 377 | 378 | island_offset[island_index[island]]++; 379 | } 380 | } 381 | 382 | { 383 | MICROPROFILE_SCOPEI("Physics", "Coalesce", -1); 384 | 385 | for (int i = 0; i < islandCount; ++i) 386 | { 387 | island_indexremap[i] = i; 388 | } 389 | 390 | int runningIndex = 0; 391 | int runningCount = 0; 392 | int totalCount = 0; 393 | 394 | for (int i = 0; i < islandCount; ++i) 395 | { 396 | runningCount += island_offset[i]; 397 | 398 | island_indexremap[i] = runningIndex; 399 | 400 | if (runningCount >= kIslandMinSize || (runningCount > 0 && i == islandCount - 1)) 401 | { 402 | int runningCountAligned = (runningCount + groupSizeTarget - 1) & ~(groupSizeTarget - 1); 403 | 404 | island_size[runningIndex] = runningCount; 405 | island_offset[runningIndex] = totalCount; 406 | 407 | totalCount += runningCountAligned; 408 | runningCount = 0; 409 | runningIndex++; 410 | } 411 | } 412 | 413 | jointCountAligned = totalCount; 414 | islandCount = runningIndex; 415 | } 416 | 417 | joint_index.resize(jointCountAligned); 418 | 419 | { 420 | MICROPROFILE_SCOPEI("Physics", "Index", -1); 421 | 422 | for (int i = 0; i < islandCount; ++i) 423 | { 424 | island_offsettemp[i] = island_offset[i]; 425 | } 426 | 427 | for (int jointIndex = 0; jointIndex < jointCount; ++jointIndex) 428 | { 429 | ContactJoint& j = contactJoints[jointIndex]; 430 | 431 | int island1 = island_remap[j.body1Index]; 432 | int island2 = island_remap[j.body2Index]; 433 | 434 | if ((island1 & island2) < 0) 435 | continue; 436 | 437 | assert(island1 == island2 || ((island1 | island2) < 0 && (island1 & island2) >= 0)); 438 | int island = island1 < 0 ? 
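// Joints between two static bodies were skipped by the (island1 & island2) < 0 test above
// (the bitwise AND is negative only when both indices are -1); for a static-dynamic pair,
// take whichever side carries a valid island index.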
island2 : island1; 439 | 440 | joint_index[island_offsettemp[island_indexremap[island_index[island]]]++] = jointIndex; 441 | } 442 | 443 | islandMaxSize = 0; 444 | 445 | for (int i = 0; i < islandCount; ++i) 446 | { 447 | assert(island_offsettemp[i] == island_offset[i] + island_size[i]); 448 | 449 | islandMaxSize = std::max(islandMaxSize, island_size[i]); 450 | } 451 | } 452 | 453 | return jointCountAligned; 454 | } 455 | 456 | NOINLINE void Solver::PrepareBodies(RigidBody* bodies, int bodiesCount) 457 | { 458 | MICROPROFILE_SCOPEI("Physics", "PrepareBodies", -1); 459 | 460 | solveBodiesParams.resize(bodiesCount); 461 | solveBodiesImpulse.resize(bodiesCount); 462 | solveBodiesDisplacement.resize(bodiesCount); 463 | 464 | for (int i = 0; i < bodiesCount; ++i) 465 | { 466 | solveBodiesParams[i].invMass = bodies[i].invMass; 467 | solveBodiesParams[i].invInertia = bodies[i].invInertia; 468 | solveBodiesParams[i].coords_pos = bodies[i].coords.pos; 469 | solveBodiesParams[i].coords_xVector = bodies[i].coords.xVector; 470 | solveBodiesParams[i].coords_yVector = bodies[i].coords.yVector; 471 | 472 | solveBodiesImpulse[i].velocity = bodies[i].velocity; 473 | solveBodiesImpulse[i].angularVelocity = bodies[i].angularVelocity; 474 | solveBodiesImpulse[i].lastIteration = -1; 475 | 476 | solveBodiesDisplacement[i].velocity = bodies[i].displacingVelocity; 477 | solveBodiesDisplacement[i].angularVelocity = bodies[i].displacingAngularVelocity; 478 | solveBodiesDisplacement[i].lastIteration = -1; 479 | } 480 | } 481 | 482 | NOINLINE void Solver::FinishBodies(RigidBody* bodies, int bodiesCount) 483 | { 484 | MICROPROFILE_SCOPEI("Physics", "FinishBodies", -1); 485 | 486 | for (int i = 0; i < bodiesCount; ++i) 487 | { 488 | bodies[i].velocity = solveBodiesImpulse[i].velocity; 489 | bodies[i].angularVelocity = solveBodiesImpulse[i].angularVelocity; 490 | 491 | bodies[i].displacingVelocity = solveBodiesDisplacement[i].velocity; 492 | bodies[i].displacingAngularVelocity = solveBodiesDisplacement[i].angularVelocity; 493 | } 494 | } 495 | 496 | template 497 | NOINLINE int Solver::PrepareJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, int groupSizeTarget) 498 | { 499 | MICROPROFILE_SCOPEI("Physics", "PrepareJoints", -1); 500 | 501 | assert(jointBegin % groupSizeTarget == 0); 502 | assert(jointBegin % N == 0); 503 | 504 | int groupOffset = PrepareIndices(jointBegin, jointEnd, groupSizeTarget); 505 | 506 | { 507 | MICROPROFILE_SCOPEI("Physics", "CopyJoints", -1); 508 | 509 | parallelFor(queue, 0, jointEnd - jointBegin, 128, [&](int i, int) { 510 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 511 | 512 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 513 | int iP = i & (N - 1); 514 | 515 | jointP.body1Index[iP] = joint.body1Index; 516 | jointP.body2Index[iP] = joint.body2Index; 517 | jointP.contactPointIndex[iP] = joint.contactPointIndex; 518 | 519 | jointP.normalLimiter_accumulatedImpulse[iP] = joint.normalLimiter_accumulatedImpulse; 520 | jointP.frictionLimiter_accumulatedImpulse[iP] = joint.frictionLimiter_accumulatedImpulse; 521 | }); 522 | } 523 | 524 | return groupOffset; 525 | } 526 | 527 | template 528 | NOINLINE void Solver::FinishJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd) 529 | { 530 | MICROPROFILE_SCOPEI("Physics", "FinishJoints", -1); 531 | 532 | assert(jointBegin % N == 0); 533 | 534 | { 535 | MICROPROFILE_SCOPEI("Physics", "CopyJoints", -1); 536 | 537 | parallelFor(queue, 0, jointEnd 
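// Only the two accumulated impulses need to travel back into the ContactJoint array -
// every other packed field is rebuilt from the contact points by RefreshJoints on the
// next solve.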
- jointBegin, 128, [&](int i, int) { 538 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 539 | 540 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 541 | int iP = i & (N - 1); 542 | 543 | joint.normalLimiter_accumulatedImpulse = jointP.normalLimiter_accumulatedImpulse[iP]; 544 | joint.frictionLimiter_accumulatedImpulse = jointP.frictionLimiter_accumulatedImpulse[iP]; 545 | }); 546 | } 547 | } 548 | 549 | template 550 | static void RefreshLimiter( 551 | ContactLimiterPacked& limiter, int iP, 552 | const Vf& n1X, const Vf& n1Y, const Vf& n2X, const Vf& n2Y, const Vf& w1X, const Vf& w1Y, const Vf& w2X, const Vf& w2Y, 553 | const Vf& body1_invMass, const Vf& body1_invInertia, const Vf& body2_invMass, const Vf& body2_invInertia) 554 | { 555 | Vf normalProjector1X = n1X; 556 | Vf normalProjector1Y = n1Y; 557 | Vf normalProjector2X = n2X; 558 | Vf normalProjector2Y = n2Y; 559 | Vf angularProjector1 = n1X * w1Y - n1Y * w1X; 560 | Vf angularProjector2 = n2X * w2Y - n2Y * w2X; 561 | 562 | Vf compMass1_linearX = normalProjector1X * body1_invMass; 563 | Vf compMass1_linearY = normalProjector1Y * body1_invMass; 564 | Vf compMass1_angular = angularProjector1 * body1_invInertia; 565 | Vf compMass2_linearX = normalProjector2X * body2_invMass; 566 | Vf compMass2_linearY = normalProjector2Y * body2_invMass; 567 | Vf compMass2_angular = angularProjector2 * body2_invInertia; 568 | 569 | Vf compMass1 = normalProjector1X * compMass1_linearX + normalProjector1Y * compMass1_linearY + angularProjector1 * compMass1_angular; 570 | Vf compMass2 = normalProjector2X * compMass2_linearX + normalProjector2Y * compMass2_linearY + angularProjector2 * compMass2_angular; 571 | 572 | Vf compMass = compMass1 + compMass2; 573 | 574 | Vf compInvMass = select(Vf::zero(), Vf::one(1) / compMass, abs(compMass) > Vf::zero()); 575 | 576 | store(normalProjector1X, &limiter.normalProjector1X[iP]); 577 | store(normalProjector1Y, &limiter.normalProjector1Y[iP]); 578 | store(normalProjector2X, &limiter.normalProjector2X[iP]); 579 | store(normalProjector2Y, &limiter.normalProjector2Y[iP]); 580 | store(angularProjector1, &limiter.angularProjector1[iP]); 581 | store(angularProjector2, &limiter.angularProjector2[iP]); 582 | 583 | store(compMass1_linearX, &limiter.compMass1_linearX[iP]); 584 | store(compMass1_linearY, &limiter.compMass1_linearY[iP]); 585 | store(compMass2_linearX, &limiter.compMass2_linearX[iP]); 586 | store(compMass2_linearY, &limiter.compMass2_linearY[iP]); 587 | store(compMass1_angular, &limiter.compMass1_angular[iP]); 588 | store(compMass2_angular, &limiter.compMass2_angular[iP]); 589 | store(compInvMass, &limiter.compInvMass[iP]); 590 | } 591 | 592 | template 593 | NOINLINE void Solver::RefreshJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints) 594 | { 595 | typedef simd::VNf Vf; 596 | 597 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 598 | 599 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 600 | { 601 | int i = jointIndex; 602 | 603 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 604 | int iP = (VN == N) ? 
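// Joints are packed N at a time in structure-of-arrays form: unsigned(i) / N selects the
// packed group and i & (N - 1) the lane within it. When the SIMD width VN equals the
// packing width N, the whole group is consumed in one pass and the lane offset is simply 0.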
0 : i & (N - 1); 605 | 606 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 607 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 608 | 609 | Vf body1_invMass, body1_invInertia, body1_coords_posX, body1_coords_posY; 610 | Vf body1_coords_xVectorX, body1_coords_xVectorY, body1_coords_yVectorX, body1_coords_yVectorY; 611 | 612 | Vf body2_invMass, body2_invInertia, body2_coords_posX, body2_coords_posY; 613 | Vf body2_coords_xVectorX, body2_coords_xVectorY, body2_coords_yVectorX, body2_coords_yVectorY; 614 | 615 | Vf collision_delta1X, collision_delta1Y, collision_delta2X, collision_delta2Y; 616 | Vf collision_normalX, collision_normalY; 617 | Vf dummy; 618 | 619 | loadindexed4( 620 | body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 621 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 622 | 623 | loadindexed4( 624 | body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 625 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 626 | 627 | loadindexed8( 628 | body1_invMass, body1_invInertia, body1_coords_posX, body1_coords_posY, 629 | body1_coords_xVectorX, body1_coords_xVectorY, body1_coords_yVectorX, body1_coords_yVectorY, 630 | solveBodiesParams.data, jointP.body1Index + iP, sizeof(SolveBodyParams)); 631 | 632 | loadindexed8( 633 | body2_invMass, body2_invInertia, body2_coords_posX, body2_coords_posY, 634 | body2_coords_xVectorX, body2_coords_xVectorY, body2_coords_yVectorX, body2_coords_yVectorY, 635 | solveBodiesParams.data, jointP.body2Index + iP, sizeof(SolveBodyParams)); 636 | 637 | loadindexed8( 638 | collision_delta1X, collision_delta1Y, collision_delta2X, collision_delta2Y, 639 | collision_normalX, collision_normalY, dummy, dummy, 640 | contactPoints, jointP.contactPointIndex + iP, sizeof(ContactPoint)); 641 | 642 | Vf point1X = collision_delta1X + body1_coords_posX; 643 | Vf point1Y = collision_delta1Y + body1_coords_posY; 644 | Vf point2X = collision_delta2X + body2_coords_posX; 645 | Vf point2Y = collision_delta2Y + body2_coords_posY; 646 | 647 | Vf w1X = collision_delta1X; 648 | Vf w1Y = collision_delta1Y; 649 | Vf w2X = point1X - body2_coords_posX; 650 | Vf w2Y = point1Y - body2_coords_posY; 651 | 652 | // Normal limiter 653 | RefreshLimiter(jointP.normalLimiter, iP, 654 | collision_normalX, collision_normalY, -collision_normalX, -collision_normalY, 655 | w1X, w1Y, w2X, w2Y, 656 | body1_invMass, body1_invInertia, body2_invMass, body2_invInertia); 657 | 658 | Vf bounce = Vf::zero(); 659 | Vf deltaVelocity = Vf::one(1.f); 660 | Vf maxPenetrationVelocity = Vf::one(0.1f); 661 | Vf deltaDepth = Vf::one(1.f); 662 | Vf errorReduction = Vf::one(0.1f); 663 | 664 | Vf pointVelocity_body1X = (body1_coords_posY - point1Y) * body1_angularVelocity + body1_velocityX; 665 | Vf pointVelocity_body1Y = (point1X - body1_coords_posX) * body1_angularVelocity + body1_velocityY; 666 | 667 | Vf pointVelocity_body2X = (body2_coords_posY - point2Y) * body2_angularVelocity + body2_velocityX; 668 | Vf pointVelocity_body2Y = (point2X - body2_coords_posX) * body2_angularVelocity + body2_velocityY; 669 | 670 | Vf relativeVelocityX = pointVelocity_body1X - pointVelocity_body2X; 671 | Vf relativeVelocityY = pointVelocity_body1Y - pointVelocity_body2Y; 672 | 673 | Vf dv = -bounce * (relativeVelocityX * collision_normalX + relativeVelocityY * collision_normalY); 674 | Vf depth = (point2X - point1X) * collision_normalX + (point2Y - point1Y) * 
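// depth is the current penetration measured along the contact normal. With bounce fixed at
// zero, the velocity-level target below is merely relaxed by maxPenetrationVelocity while
// penetration is shallow (depth < deltaDepth), and the displacement pass is only asked to
// remove the part of the penetration that exceeds 2 * deltaDepth, scaled by errorReduction -
// presumably to keep resting stacks from jittering.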
collision_normalY; 675 | 676 | Vf dstVelocity = max(dv - deltaVelocity, Vf::zero()); 677 | 678 | Vf j_normalLimiter_dstVelocity = select(dstVelocity, dstVelocity - maxPenetrationVelocity, depth < deltaDepth); 679 | Vf j_normalLimiter_dstDisplacingVelocity = errorReduction * max(Vf::zero(), depth - Vf::one(2.0f) * deltaDepth); 680 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::zero(); 681 | 682 | // Friction limiter 683 | Vf tangentX = -collision_normalY; 684 | Vf tangentY = collision_normalX; 685 | 686 | RefreshLimiter(jointP.frictionLimiter, iP, 687 | tangentX, tangentY, -tangentX, -tangentY, 688 | w1X, w1Y, w2X, w2Y, 689 | body1_invMass, body1_invInertia, body2_invMass, body2_invInertia); 690 | 691 | store(j_normalLimiter_dstVelocity, &jointP.normalLimiter_dstVelocity[iP]); 692 | store(j_normalLimiter_dstDisplacingVelocity, &jointP.normalLimiter_dstDisplacingVelocity[iP]); 693 | store(j_normalLimiter_accumulatedDisplacingImpulse, &jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 694 | } 695 | } 696 | 697 | template 698 | NOINLINE void Solver::PreStepJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd) 699 | { 700 | typedef simd::VNf Vf; 701 | 702 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 703 | 704 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 705 | { 706 | int i = jointIndex; 707 | 708 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 709 | int iP = (VN == N) ? 0 : i & (N - 1); 710 | 711 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 712 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 713 | 714 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 715 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 716 | 717 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 718 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 719 | 720 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 721 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 722 | Vf j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 723 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 724 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 725 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 726 | Vf j_normalLimiter_accumulatedImpulse = Vf::load(&jointP.normalLimiter_accumulatedImpulse[iP]); 727 | 728 | Vf j_frictionLimiter_compMass1_linearX = Vf::load(&jointP.frictionLimiter.compMass1_linearX[iP]); 729 | Vf j_frictionLimiter_compMass1_linearY = Vf::load(&jointP.frictionLimiter.compMass1_linearY[iP]); 730 | Vf j_frictionLimiter_compMass2_linearX = Vf::load(&jointP.frictionLimiter.compMass2_linearX[iP]); 731 | Vf j_frictionLimiter_compMass2_linearY = Vf::load(&jointP.frictionLimiter.compMass2_linearY[iP]); 732 | Vf j_frictionLimiter_compMass1_angular = Vf::load(&jointP.frictionLimiter.compMass1_angular[iP]); 733 | Vf j_frictionLimiter_compMass2_angular = Vf::load(&jointP.frictionLimiter.compMass2_angular[iP]); 734 | Vf j_frictionLimiter_accumulatedImpulse = Vf::load(&jointP.frictionLimiter_accumulatedImpulse[iP]); 735 | 736 | body1_velocityX += j_normalLimiter_compMass1_linearX * 
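// Warm starting: the impulses accumulated on the previous frame are applied up front, so
// the iterative solve only has to find a correction on top of an already plausible solution.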
j_normalLimiter_accumulatedImpulse; 737 | body1_velocityY += j_normalLimiter_compMass1_linearY * j_normalLimiter_accumulatedImpulse; 738 | body1_angularVelocity += j_normalLimiter_compMass1_angular * j_normalLimiter_accumulatedImpulse; 739 | 740 | body2_velocityX += j_normalLimiter_compMass2_linearX * j_normalLimiter_accumulatedImpulse; 741 | body2_velocityY += j_normalLimiter_compMass2_linearY * j_normalLimiter_accumulatedImpulse; 742 | body2_angularVelocity += j_normalLimiter_compMass2_angular * j_normalLimiter_accumulatedImpulse; 743 | 744 | body1_velocityX += j_frictionLimiter_compMass1_linearX * j_frictionLimiter_accumulatedImpulse; 745 | body1_velocityY += j_frictionLimiter_compMass1_linearY * j_frictionLimiter_accumulatedImpulse; 746 | body1_angularVelocity += j_frictionLimiter_compMass1_angular * j_frictionLimiter_accumulatedImpulse; 747 | 748 | body2_velocityX += j_frictionLimiter_compMass2_linearX * j_frictionLimiter_accumulatedImpulse; 749 | body2_velocityY += j_frictionLimiter_compMass2_linearY * j_frictionLimiter_accumulatedImpulse; 750 | body2_angularVelocity += j_frictionLimiter_compMass2_angular * j_frictionLimiter_accumulatedImpulse; 751 | 752 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 753 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 754 | 755 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 756 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 757 | } 758 | } 759 | 760 | template 761 | NOINLINE bool Solver::SolveJointsImpulses(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex) 762 | { 763 | typedef simd::VNf Vf; 764 | typedef simd::VNi Vi; 765 | typedef simd::VNb Vb; 766 | 767 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 768 | 769 | Vi iterationIndex0 = Vi::one(iterationIndex); 770 | Vi iterationIndex2 = Vi::one(iterationIndex - 2); 771 | 772 | Vb productive_any = Vb::zero(); 773 | 774 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 775 | { 776 | int i = jointIndex; 777 | 778 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 779 | int iP = (VN == N) ? 
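// Each body stores the index of the last iteration in which one of its joints applied a
// meaningful impulse, bit-cast into the fourth float lane so it rides along with the
// velocity in a single gather. If neither body satisfies lastIteration > iterationIndex - 2,
// the whole joint group is skipped.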
0 : i & (N - 1); 780 | 781 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 782 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 783 | 784 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 785 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 786 | 787 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 788 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 789 | 790 | Vi body1_lastIteration = bitcast(body1_lastIterationf); 791 | Vi body2_lastIteration = bitcast(body2_lastIterationf); 792 | 793 | Vb body1_productive = body1_lastIteration > iterationIndex2; 794 | Vb body2_productive = body2_lastIteration > iterationIndex2; 795 | Vb body_productive = body1_productive | body2_productive; 796 | 797 | if (none(body_productive)) 798 | continue; 799 | 800 | Vf j_normalLimiter_normalProjector1X = Vf::load(&jointP.normalLimiter.normalProjector1X[iP]); 801 | Vf j_normalLimiter_normalProjector1Y = Vf::load(&jointP.normalLimiter.normalProjector1Y[iP]); 802 | Vf j_normalLimiter_normalProjector2X = Vf::load(&jointP.normalLimiter.normalProjector2X[iP]); 803 | Vf j_normalLimiter_normalProjector2Y = Vf::load(&jointP.normalLimiter.normalProjector2Y[iP]); 804 | Vf j_normalLimiter_angularProjector1 = Vf::load(&jointP.normalLimiter.angularProjector1[iP]); 805 | Vf j_normalLimiter_angularProjector2 = Vf::load(&jointP.normalLimiter.angularProjector2[iP]); 806 | 807 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 808 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 809 | Vf j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 810 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 811 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 812 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 813 | Vf j_normalLimiter_compInvMass = Vf::load(&jointP.normalLimiter.compInvMass[iP]); 814 | Vf j_normalLimiter_accumulatedImpulse = Vf::load(&jointP.normalLimiter_accumulatedImpulse[iP]); 815 | Vf j_normalLimiter_dstVelocity = Vf::load(&jointP.normalLimiter_dstVelocity[iP]); 816 | 817 | Vf j_frictionLimiter_normalProjector1X = Vf::load(&jointP.frictionLimiter.normalProjector1X[iP]); 818 | Vf j_frictionLimiter_normalProjector1Y = Vf::load(&jointP.frictionLimiter.normalProjector1Y[iP]); 819 | Vf j_frictionLimiter_normalProjector2X = Vf::load(&jointP.frictionLimiter.normalProjector2X[iP]); 820 | Vf j_frictionLimiter_normalProjector2Y = Vf::load(&jointP.frictionLimiter.normalProjector2Y[iP]); 821 | Vf j_frictionLimiter_angularProjector1 = Vf::load(&jointP.frictionLimiter.angularProjector1[iP]); 822 | Vf j_frictionLimiter_angularProjector2 = Vf::load(&jointP.frictionLimiter.angularProjector2[iP]); 823 | 824 | Vf j_frictionLimiter_compMass1_linearX = Vf::load(&jointP.frictionLimiter.compMass1_linearX[iP]); 825 | Vf j_frictionLimiter_compMass1_linearY = Vf::load(&jointP.frictionLimiter.compMass1_linearY[iP]); 826 | Vf j_frictionLimiter_compMass2_linearX = Vf::load(&jointP.frictionLimiter.compMass2_linearX[iP]); 827 | Vf j_frictionLimiter_compMass2_linearY = Vf::load(&jointP.frictionLimiter.compMass2_linearY[iP]); 828 | Vf j_frictionLimiter_compMass1_angular = 
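// Sequential impulses: the normal impulse is solved and clamped first, and its accumulated
// value then bounds the friction impulse below via the Coulomb cone
// |friction| <= kFrictionCoefficient * normal.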
Vf::load(&jointP.frictionLimiter.compMass1_angular[iP]); 829 | Vf j_frictionLimiter_compMass2_angular = Vf::load(&jointP.frictionLimiter.compMass2_angular[iP]); 830 | Vf j_frictionLimiter_compInvMass = Vf::load(&jointP.frictionLimiter.compInvMass[iP]); 831 | Vf j_frictionLimiter_accumulatedImpulse = Vf::load(&jointP.frictionLimiter_accumulatedImpulse[iP]); 832 | 833 | Vf normaldV = j_normalLimiter_dstVelocity; 834 | 835 | normaldV -= j_normalLimiter_normalProjector1X * body1_velocityX; 836 | normaldV -= j_normalLimiter_normalProjector1Y * body1_velocityY; 837 | normaldV -= j_normalLimiter_angularProjector1 * body1_angularVelocity; 838 | 839 | normaldV -= j_normalLimiter_normalProjector2X * body2_velocityX; 840 | normaldV -= j_normalLimiter_normalProjector2Y * body2_velocityY; 841 | normaldV -= j_normalLimiter_angularProjector2 * body2_angularVelocity; 842 | 843 | Vf normalDeltaImpulse = normaldV * j_normalLimiter_compInvMass; 844 | 845 | normalDeltaImpulse = max(normalDeltaImpulse, -j_normalLimiter_accumulatedImpulse); 846 | 847 | body1_velocityX += j_normalLimiter_compMass1_linearX * normalDeltaImpulse; 848 | body1_velocityY += j_normalLimiter_compMass1_linearY * normalDeltaImpulse; 849 | body1_angularVelocity += j_normalLimiter_compMass1_angular * normalDeltaImpulse; 850 | 851 | body2_velocityX += j_normalLimiter_compMass2_linearX * normalDeltaImpulse; 852 | body2_velocityY += j_normalLimiter_compMass2_linearY * normalDeltaImpulse; 853 | body2_angularVelocity += j_normalLimiter_compMass2_angular * normalDeltaImpulse; 854 | 855 | j_normalLimiter_accumulatedImpulse += normalDeltaImpulse; 856 | 857 | Vf frictiondV = Vf::zero(); 858 | 859 | frictiondV -= j_frictionLimiter_normalProjector1X * body1_velocityX; 860 | frictiondV -= j_frictionLimiter_normalProjector1Y * body1_velocityY; 861 | frictiondV -= j_frictionLimiter_angularProjector1 * body1_angularVelocity; 862 | 863 | frictiondV -= j_frictionLimiter_normalProjector2X * body2_velocityX; 864 | frictiondV -= j_frictionLimiter_normalProjector2Y * body2_velocityY; 865 | frictiondV -= j_frictionLimiter_angularProjector2 * body2_angularVelocity; 866 | 867 | Vf frictionDeltaImpulse = frictiondV * j_frictionLimiter_compInvMass; 868 | 869 | Vf reactionForce = j_normalLimiter_accumulatedImpulse; 870 | Vf accumulatedImpulse = j_frictionLimiter_accumulatedImpulse; 871 | 872 | Vf frictionForce = accumulatedImpulse + frictionDeltaImpulse; 873 | Vf reactionForceScaled = reactionForce * Vf::one(kFrictionCoefficient); 874 | 875 | Vf frictionForceAbs = abs(frictionForce); 876 | Vf reactionForceScaledSigned = flipsign(reactionForceScaled, frictionForce); 877 | Vf frictionDeltaImpulseAdjusted = reactionForceScaledSigned - accumulatedImpulse; 878 | 879 | frictionDeltaImpulse = select(frictionDeltaImpulse, frictionDeltaImpulseAdjusted, frictionForceAbs > reactionForceScaled); 880 | 881 | j_frictionLimiter_accumulatedImpulse += frictionDeltaImpulse; 882 | 883 | body1_velocityX += j_frictionLimiter_compMass1_linearX * frictionDeltaImpulse; 884 | body1_velocityY += j_frictionLimiter_compMass1_linearY * frictionDeltaImpulse; 885 | body1_angularVelocity += j_frictionLimiter_compMass1_angular * frictionDeltaImpulse; 886 | 887 | body2_velocityX += j_frictionLimiter_compMass2_linearX * frictionDeltaImpulse; 888 | body2_velocityY += j_frictionLimiter_compMass2_linearY * frictionDeltaImpulse; 889 | body2_angularVelocity += j_frictionLimiter_compMass2_angular * frictionDeltaImpulse; 890 | 891 | store(j_normalLimiter_accumulatedImpulse, 
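// The accumulated impulses are persisted for warm starting; the joint then counts as
// "productive" only if the larger of the two corrections exceeded kProductiveImpulse,
// which updates the per-body lastIteration stamps and feeds the early-out for the
// whole iteration.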
&jointP.normalLimiter_accumulatedImpulse[iP]); 892 | store(j_frictionLimiter_accumulatedImpulse, &jointP.frictionLimiter_accumulatedImpulse[iP]); 893 | 894 | Vf cumulativeImpulse = max(abs(normalDeltaImpulse), abs(frictionDeltaImpulse)); 895 | 896 | Vb productive = cumulativeImpulse > Vf::one(kProductiveImpulse); 897 | 898 | productive_any |= productive; 899 | 900 | body1_lastIteration = select(body1_lastIteration, iterationIndex0, productive); 901 | body2_lastIteration = select(body2_lastIteration, iterationIndex0, productive); 902 | 903 | body1_lastIterationf = bitcast(body1_lastIteration); 904 | body2_lastIterationf = bitcast(body2_lastIteration); 905 | 906 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 907 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 908 | 909 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 910 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 911 | } 912 | 913 | return any(productive_any); 914 | } 915 | 916 | template 917 | NOINLINE bool Solver::SolveJointsDisplacement(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex) 918 | { 919 | typedef simd::VNf Vf; 920 | typedef simd::VNi Vi; 921 | typedef simd::VNb Vb; 922 | 923 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 924 | 925 | Vi iterationIndex0 = Vi::one(iterationIndex); 926 | Vi iterationIndex2 = Vi::one(iterationIndex - 2); 927 | 928 | Vb productive_any = Vb::zero(); 929 | 930 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 931 | { 932 | int i = jointIndex; 933 | 934 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 935 | int iP = (VN == N) ? 0 : i & (N - 1); 936 | 937 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 938 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 939 | 940 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 941 | solveBodiesDisplacement.data, jointP.body1Index + iP, sizeof(SolveBody)); 942 | 943 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 944 | solveBodiesDisplacement.data, jointP.body2Index + iP, sizeof(SolveBody)); 945 | 946 | Vi body1_lastIteration = bitcast(body1_lastIterationf); 947 | Vi body2_lastIteration = bitcast(body2_lastIterationf); 948 | 949 | Vb body1_productive = body1_lastIteration > iterationIndex2; 950 | Vb body2_productive = body2_lastIteration > iterationIndex2; 951 | Vb body_productive = body1_productive | body2_productive; 952 | 953 | if (none(body_productive)) 954 | continue; 955 | 956 | Vf j_normalLimiter_normalProjector1X = Vf::load(&jointP.normalLimiter.normalProjector1X[iP]); 957 | Vf j_normalLimiter_normalProjector1Y = Vf::load(&jointP.normalLimiter.normalProjector1Y[iP]); 958 | Vf j_normalLimiter_normalProjector2X = Vf::load(&jointP.normalLimiter.normalProjector2X[iP]); 959 | Vf j_normalLimiter_normalProjector2Y = Vf::load(&jointP.normalLimiter.normalProjector2Y[iP]); 960 | Vf j_normalLimiter_angularProjector1 = Vf::load(&jointP.normalLimiter.angularProjector1[iP]); 961 | Vf j_normalLimiter_angularProjector2 = Vf::load(&jointP.normalLimiter.angularProjector2[iP]); 962 | 963 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 964 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 965 | Vf 
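// The displacement pass runs on its own velocity state (solveBodiesDisplacement); these
// pseudo-velocities are applied once during position integration and then cleared, so
// penetration correction never injects momentum into the impulse solve.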
j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 966 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 967 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 968 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 969 | Vf j_normalLimiter_compInvMass = Vf::load(&jointP.normalLimiter.compInvMass[iP]); 970 | Vf j_normalLimiter_dstDisplacingVelocity = Vf::load(&jointP.normalLimiter_dstDisplacingVelocity[iP]); 971 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::load(&jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 972 | 973 | Vf dV = j_normalLimiter_dstDisplacingVelocity; 974 | 975 | dV -= j_normalLimiter_normalProjector1X * body1_velocityX; 976 | dV -= j_normalLimiter_normalProjector1Y * body1_velocityY; 977 | dV -= j_normalLimiter_angularProjector1 * body1_angularVelocity; 978 | 979 | dV -= j_normalLimiter_normalProjector2X * body2_velocityX; 980 | dV -= j_normalLimiter_normalProjector2Y * body2_velocityY; 981 | dV -= j_normalLimiter_angularProjector2 * body2_angularVelocity; 982 | 983 | Vf displacingDeltaImpulse = dV * j_normalLimiter_compInvMass; 984 | 985 | displacingDeltaImpulse = max(displacingDeltaImpulse, -j_normalLimiter_accumulatedDisplacingImpulse); 986 | 987 | body1_velocityX += j_normalLimiter_compMass1_linearX * displacingDeltaImpulse; 988 | body1_velocityY += j_normalLimiter_compMass1_linearY * displacingDeltaImpulse; 989 | body1_angularVelocity += j_normalLimiter_compMass1_angular * displacingDeltaImpulse; 990 | 991 | body2_velocityX += j_normalLimiter_compMass2_linearX * displacingDeltaImpulse; 992 | body2_velocityY += j_normalLimiter_compMass2_linearY * displacingDeltaImpulse; 993 | body2_angularVelocity += j_normalLimiter_compMass2_angular * displacingDeltaImpulse; 994 | 995 | j_normalLimiter_accumulatedDisplacingImpulse += displacingDeltaImpulse; 996 | 997 | store(j_normalLimiter_accumulatedDisplacingImpulse, &jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 998 | 999 | Vb productive = abs(displacingDeltaImpulse) > Vf::one(kProductiveImpulse); 1000 | 1001 | productive_any |= productive; 1002 | 1003 | body1_lastIteration = select(body1_lastIteration, iterationIndex0, productive); 1004 | body2_lastIteration = select(body2_lastIteration, iterationIndex0, productive); 1005 | 1006 | // this is a bit painful :( 1007 | body1_lastIterationf = bitcast(body1_lastIteration); 1008 | body2_lastIterationf = bitcast(body2_lastIteration); 1009 | 1010 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 1011 | solveBodiesDisplacement.data, jointP.body1Index + iP, sizeof(SolveBody)); 1012 | 1013 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 1014 | solveBodiesDisplacement.data, jointP.body2Index + iP, sizeof(SolveBody)); 1015 | } 1016 | 1017 | return any(productive_any); 1018 | } -------------------------------------------------------------------------------- /src/Solver.h: -------------------------------------------------------------------------------- 1 | #include "Joints.h" 2 | #include 3 | #include 4 | 5 | #include "base/AlignedArray.h" 6 | 7 | template 8 | struct ContactLimiterPacked 9 | { 10 | float normalProjector1X[N]; 11 | float normalProjector1Y[N]; 12 | float normalProjector2X[N]; 13 | float normalProjector2Y[N]; 14 | float angularProjector1[N]; 15 | float angularProjector2[N]; 16 | 17 | float 
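// Every limiter field is stored as a small array of N floats (structure of arrays), so a
// packed group of joints can be brought into a SIMD register with a single aligned load.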
compMass1_linearX[N]; 18 | float compMass1_linearY[N]; 19 | float compMass2_linearX[N]; 20 | float compMass2_linearY[N]; 21 | float compMass1_angular[N]; 22 | float compMass2_angular[N]; 23 | float compInvMass[N]; 24 | }; 25 | 26 | template 27 | struct ContactJointPacked 28 | { 29 | int body1Index[N]; 30 | int body2Index[N]; 31 | int contactPointIndex[N]; 32 | 33 | ContactLimiterPacked normalLimiter; 34 | 35 | float normalLimiter_compInvMass[N]; 36 | float normalLimiter_accumulatedImpulse[N]; 37 | 38 | float normalLimiter_dstVelocity[N]; 39 | float normalLimiter_dstDisplacingVelocity[N]; 40 | float normalLimiter_accumulatedDisplacingImpulse[N]; 41 | 42 | ContactLimiterPacked frictionLimiter; 43 | 44 | float frictionLimiter_accumulatedImpulse[N]; 45 | }; 46 | 47 | class WorkQueue; 48 | struct Configuration; 49 | 50 | struct Solver 51 | { 52 | Solver(); 53 | 54 | void SolveJoints(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 55 | 56 | void SolveJoints_Scalar(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 57 | void SolveJoints_SSE2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 58 | void SolveJoints_AVX2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 59 | 60 | template 61 | void SolveJoints(WorkQueue& queue, AlignedArray>& joint_packed, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 62 | 63 | int GatherIslands(RigidBody* bodies, int bodiesCount, int groupSizeTarget); 64 | void PrepareBodies(RigidBody* bodies, int bodiesCount); 65 | void FinishBodies(RigidBody* bodies, int bodiesCount); 66 | 67 | template 68 | void SolveJointIsland(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints, const Configuration& configuration); 69 | 70 | template 71 | int PrepareJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, int groupSizeTarget); 72 | template 73 | void FinishJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd); 74 | 75 | int PrepareIndices(int jointBegin, int jointEnd, int groupSizeTarget); 76 | 77 | template 78 | void RefreshJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints); 79 | template 80 | void PreStepJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd); 81 | template 82 | bool SolveJointsImpulses(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex); 83 | template 84 | bool SolveJointsDisplacement(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex); 85 | 86 | struct SolveBodyParams 87 | { 88 | float invMass; 89 | float invInertia; 90 | 91 | Vector2f coords_pos; 92 | 93 | Vector2f coords_xVector; 94 | Vector2f coords_yVector; 95 | }; 96 | 97 | struct SolveBody 98 | { 99 | Vector2f velocity; 100 | float angularVelocity; 101 | 102 | int lastIteration; 103 | }; 104 | 105 | int islandCount; 106 | int islandMaxSize; 107 | 108 | AlignedArray solveBodiesParams; 109 | AlignedArray solveBodiesImpulse; 110 | AlignedArray solveBodiesDisplacement; 111 | 112 | AlignedArray contactJoints; 113 | 114 | AlignedArray jointGroup_bodies; 115 | AlignedArray jointGroup_joints; 116 | 117 | AlignedArray joint_index; 118 | 119 | 
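// Scratch state for island gathering: island_remap is the union-find parent table per body,
// island_index/island_indexremap map union-find roots to dense (and then coalesced) island
// slots, and island_offset/island_size describe each island's range inside joint_index.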
AlignedArray island_remap; 120 | AlignedArray island_index; 121 | AlignedArray island_indexremap; 122 | AlignedArray island_offset; 123 | AlignedArray island_offsettemp; 124 | AlignedArray island_size; 125 | 126 | AlignedArray> joint_packed1; 127 | AlignedArray> joint_packed4; 128 | AlignedArray> joint_packed8; 129 | }; -------------------------------------------------------------------------------- /src/Vector2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | struct Vector2 8 | { 9 | T x, y; 10 | T& operator[](const int i) 11 | { 12 | return *(&(x) + i); 13 | } 14 | inline Vector2() {} 15 | //inline Vector3d(const Vector3d & rhs) { *this = rhs; } 16 | inline Vector2(const T& _x, const T& _y) 17 | : x(_x) 18 | , y(_y) 19 | { 20 | } 21 | inline T Len() const 22 | { 23 | return sqrt(x * x + y * y); 24 | } 25 | 26 | inline T SquareLen() const 27 | { 28 | return x * x + y * y; 29 | } 30 | 31 | inline void Normalize() //Normalize itself 32 | { 33 | T l = sqrtf(x * x + y * y); 34 | if (fabs(l) > T(1e-8)) 35 | { 36 | T k = T(1.0) / l; 37 | x *= k; 38 | y *= k; 39 | } 40 | } 41 | 42 | inline void Invert() 43 | { 44 | x = -x; 45 | y = -y; 46 | } 47 | 48 | void Rotate(const T angle) 49 | { 50 | Vector2 self = *this; 51 | Vector2 x = self; 52 | Vector2 y = Vector2(-x.y, x.x); 53 | Vector2 delta = x * cos(angle) + y * sin(angle) - x; 54 | self += delta; 55 | *this = self; 56 | } 57 | 58 | inline Vector2 GetNorm() const 59 | { 60 | T l = sqrt(x * x + y * y); 61 | if (fabs(l) > T(1e-8)) 62 | { 63 | T k = T(1.0) / l; 64 | return Vector2(x * k, y * k); 65 | } 66 | else 67 | { 68 | return Vector2(0, 0); 69 | } 70 | } 71 | 72 | inline Vector2 operator-() const 73 | { 74 | return Vector2(-x, -y); 75 | } 76 | 77 | void Decrease(T val) 78 | { 79 | if (SquareLen() > val * val) 80 | { 81 | T len = Len(); 82 | T scale = (len - val) / len; 83 | x *= scale; 84 | y *= scale; 85 | } 86 | else 87 | { 88 | x = 0.0f; 89 | y = 0.0f; 90 | } 91 | } 92 | 93 | inline Vector2& operator*=(const T& val) 94 | { 95 | x *= val; 96 | y *= val; 97 | return *this; 98 | } 99 | 100 | inline Vector2& operator/=(const T& val) 101 | { 102 | T inv = T(1.0) / val; 103 | x *= inv; 104 | y *= inv; 105 | return *this; 106 | } 107 | inline Vector2& operator+=(const Vector2& vec) 108 | { 109 | x += vec.x; 110 | y += vec.y; 111 | return *this; 112 | } 113 | inline Vector2& operator-=(const Vector2& vec) 114 | { 115 | x -= vec.x; 116 | y -= vec.y; 117 | return *this; 118 | } 119 | inline Vector2& operator--() 120 | { 121 | x = -x; 122 | y = -y; 123 | return *this; 124 | } 125 | 126 | inline Vector2 operator+(const Vector2& vec) const 127 | { 128 | return Vector2(x + vec.x, y + vec.y); 129 | } 130 | inline Vector2 operator-(const Vector2& vec) const 131 | { 132 | return Vector2(x - vec.x, y - vec.y); 133 | } 134 | inline T operator*(const Vector2& vec) const 135 | { 136 | return x * vec.x + y * vec.y; 137 | } 138 | inline Vector2 operator*(const T& val) const 139 | { 140 | return Vector2(x * val, y * val); 141 | } 142 | Vector2 GetPerpendicular() const 143 | { 144 | return Vector2(-y, x); 145 | } 146 | 147 | template 148 | inline const Vector2 operator=(const SomeVector& v) 149 | { 150 | x = v.x; 151 | y = v.y; 152 | return *this; 153 | } 154 | 155 | static const Vector2 zeroVector() 156 | { 157 | return Vector2(0, 0); 158 | } 159 | 160 | static const Vector2 zero() 161 | { 162 | return zeroVector(); 163 | } 164 | 165 | static const Vector2 one() 
166 | { 167 | return Vector2(T(1.0), T(1.0)); 168 | } 169 | 170 | static const Vector2 xAxis() 171 | { 172 | return Vector2(T(1.0), 0); 173 | } 174 | 175 | static const Vector2 yAxis() 176 | { 177 | return Vector2(0, T(1.0)); 178 | } 179 | }; 180 | 181 | template 182 | inline Vector2 operator*(const T& d, const Vector2& V) 183 | { 184 | return Vector2(V.x * d, V.y * d); 185 | } 186 | 187 | template 188 | inline Vector2 operator/(const Vector2& V, const T& d) 189 | { 190 | T invd = T(1.0) / d; 191 | return Vector2(V.x * invd, V.y * invd); 192 | } 193 | 194 | template 195 | inline T operator^(const Vector2& v1, const Vector2& v2) 196 | { 197 | return v1.x * v2.y - v1.y * v2.x; 198 | } 199 | 200 | typedef Vector2 Vector2f; 201 | typedef Vector2 Vector2d; 202 | 203 | const static Vector2f zeroVector2f = Vector2f(0, 0); 204 | const static Vector2f xAxis2f = Vector2f(1, 0); 205 | const static Vector2f yAxis2f = Vector2f(0, 1); 206 | 207 | const static Vector2d zeroVector2d = Vector2d(0, 0); 208 | const static Vector2d xAxis2d = Vector2d(1, 0); 209 | const static Vector2d yAxis2d = Vector2d(0, 1); 210 | 211 | template 212 | bool GetTwoLinesIntersection(const Vector2& p1, const Vector2& p2, const Vector2& t1, const Vector2& t2, Vector2& p0) 213 | { 214 | Vector2 v1, v2; 215 | T k1, k2; 216 | v1 = p2 - p1; 217 | v2 = t2 - t1; 218 | T invmul; 219 | T mul = v1 ^ v2; 220 | if (fabs(mul) > T(1e-5)) 221 | { 222 | invmul = 1.0f / (v1 ^ v2); 223 | k2 = ((t1 ^ v1) - (p1 ^ v1)) * invmul; /*p1.x * v1.y - p1.y * v1.x + t1.y * v1.x - t1.x * v1.y*/ 224 | k1 = ((t1 ^ v2) - (p1 ^ v2)) * invmul; 225 | p0 = p1 + v1 * k1; 226 | // Vector p02 = t1 + v2 * k2; 227 | return ((k1 > 0.0f) && (k1 < 1.0f) && (k2 > 0.0f) && (k2 < 1.0f)); 228 | } 229 | else 230 | { 231 | return 0; 232 | } 233 | p0 = t1 + (t1 - t2); //100% bad point 234 | return 0; 235 | } 236 | 237 | template 238 | bool ProjectPointToLine(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0, T& signOfSide) 239 | { 240 | Vector2 v1 = p - t1; 241 | Vector2 v2 = t2 - t1; 242 | signOfSide = sgn(v1 ^ v2); 243 | p0 = t1 + v2 * ((v1 * v2) / v2.SquareLen()); 244 | if ((v1 * v2 >= 0.0f) && ((v1 * v2) / (v2.SquareLen()) <= 1.0f)) 245 | { 246 | return 1; 247 | } 248 | else 249 | { 250 | return 0; 251 | } 252 | } 253 | 254 | template 255 | bool ProjectPointToLine(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0) 256 | { 257 | T signOfSide; 258 | return ProjectPointToLine(t1, t2, p, p0, signOfSide); 259 | } 260 | 261 | template 262 | T PointToSegmentDistanse(const Vector2& t1, const Vector2& t2, const Vector2& p) 263 | { 264 | Vector2 p0; 265 | T signOfSide; 266 | if (ProjectPointToLine(t1, t2, p, p0, signOfSide)) 267 | { 268 | return Vector2(p.x - p0.x, p.y - p0.y).Len(); 269 | } 270 | else 271 | { 272 | return Min(Vector2(p.x - t1.x, p.y - t1.y).Len(), 273 | Vector2(p.x - t2.x, p.y - t2.y).Len()); 274 | } 275 | } 276 | 277 | template 278 | void ProjectPointToLine(const Vector2& point, const Vector2& planePoint, const Vector2& planeNormal, const Vector2& projectionDirection, 279 | Vector2f& projectedPoint) 280 | { 281 | float mult = 1.0f / (projectionDirection * planeNormal); 282 | projectedPoint = point + projectionDirection * ((planePoint * planeNormal) - (point * planeNormal)) * mult; 283 | } 284 | 285 | template 286 | void ProjectPointToPlane(const Vector2& point, const Vector2& planePoint, const Vector2& planeNormal, const Vector2& projectionDirection, 287 | Vector2f& projectedPoint) 288 | { 289 | ProjectPointToLine(point, 
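// In 2D a "plane" degenerates to a line, so projecting onto a plane simply forwards to the
// line projection overload above.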
planePoint, planeNormal, projectionDirection, projectedPoint); 290 | } 291 | 292 | template 293 | void ProjectPointToLine_noreturn(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0, T& signOfSide) 294 | { 295 | Vector2 v1 = p - t1; 296 | Vector2 v2 = t2 - t1; 297 | signOfSide = sgn(v1 ^ v2); 298 | p0 = t1 + v2 * ((v1 * v2) / v2.SquareLen()); 299 | } 300 | 301 | template 302 | bool IsPointInCellEx(const typename GeomSpace::Vector2 points[3], typename GeomSpace::Vector2 testPoint, typename GeomSpace::Scalar eps = 0) 303 | { 304 | typedef typename GeomSpace::Scalar Scalar; 305 | 306 | Scalar side0 = ((points[1] - points[0]) ^ (testPoint - points[0])); 307 | Scalar side1 = ((points[2] - points[1]) ^ (testPoint - points[1])); 308 | Scalar side2 = ((points[0] - points[2]) ^ (testPoint - points[2])); 309 | 310 | if (side0 >= -eps && side1 >= -eps && side2 >= -eps) return 1; 311 | if (side0 <= eps && side1 <= eps && side2 <= eps) return 1; 312 | return 0; 313 | } 314 | 315 | template 316 | bool IsPointInCell(const typename GeomSpace::Vector2 points[3], typename GeomSpace::Vector2 testPoint) 317 | { 318 | typedef typename GeomSpace::Scalar Scalar; 319 | //Scalar //eps = 0;//-1e-4;//std::numeric_limits::epsilon();//Scalar(1e-9); 320 | Scalar eps = std::numeric_limits::epsilon(); 321 | 322 | Scalar side0 = ((points[1] - points[0]) ^ (testPoint - points[0])); 323 | Scalar side1 = ((points[2] - points[1]) ^ (testPoint - points[1])); 324 | Scalar side2 = ((points[0] - points[2]) ^ (testPoint - points[2])); 325 | 326 | if (side0 >= -eps && side1 >= -eps && side2 >= -eps) return 1; 327 | if (side0 <= eps && side1 <= eps && side2 <= eps) return 1; 328 | return 0; 329 | 330 | /*Scalar eps = std::numeric_limits::epsilon();//Scalar(1e-9); 331 | Scalar side012 = mixed_product(points[1] - points[0], points[2] - points[0], testPoint - points[0]) * 332 | mixed_product(points[1] - points[0], points[2] - points[0], points[3] - points[0]); 333 | if(side012 < -eps) return 0; 334 | 335 | Scalar side123 = mixed_product(points[1] - points[2], points[3] - points[2], testPoint - points[2]) * 336 | mixed_product(points[1] - points[2], points[3] - points[2], points[0] - points[2]); 337 | if(side123 < -eps) return 0; 338 | 339 | Scalar side230 = mixed_product(points[2] - points[3], points[0] - points[3], testPoint - points[3]) * 340 | mixed_product(points[2] - points[3], points[0] - points[3], points[1] - points[3]); 341 | if(side230 < -eps) return 0; 342 | 343 | Scalar side013 = mixed_product(points[0] - points[1], points[3] - points[1], testPoint - points[1]) * 344 | mixed_product(points[0] - points[1], points[3] - points[1], points[2] - points[1]); 345 | if(side013 < -eps) return 0; 346 | 347 | return 1;*/ 348 | 349 | /*Scalar side1 = mixed_product(points[2] - points[0], points[3] - points[0], testPoint - points[0]); 350 | Scalar side2 = mixed_product(points[3] - points[0], points[1] - points[0], testPoint - points[0]); 351 | Scalar side3 = mixed_product(points[3] - points[1], points[2] - points[1], testPoint - points[1]); 352 | if (side0 * side1 < 0) return 0; 353 | if (side1 * side2 < 0) return 0; 354 | if (side2 * side3 < 0) return 0; 355 | 356 | return 1;*/ 357 | } 358 | -------------------------------------------------------------------------------- /src/World.cpp: -------------------------------------------------------------------------------- 1 | #include "World.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "microprofile.h" 5 | 6 | World::World() 7 | : gravity(0) 8 | { 9 | } 10 | 11 | 
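// Bodies live by value in a contiguous AlignedArray, so the pointer returned by AddBody
// below is only valid until a later AddBody forces the array to reallocate.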
RigidBody* World::AddBody(Coords2f coords, Vector2f size) 12 | { 13 | RigidBody newbie(coords, size, 1e-5f); 14 | newbie.index = bodies.size; 15 | bodies.push_back(newbie); 16 | return &(bodies[bodies.size - 1]); 17 | } 18 | 19 | void World::Update(WorkQueue& queue, float dt, const Configuration& configuration) 20 | { 21 | MICROPROFILE_SCOPEI("Physics", "Update", 0x00ff00); 22 | 23 | collisionTime = mergeTime = solveTime = 0; 24 | 25 | IntegrateVelocity(queue, dt); 26 | 27 | collider.UpdateBroadphase(bodies.data, bodies.size); 28 | collider.UpdatePairs(queue, bodies.data, bodies.size); 29 | collider.UpdateManifolds(queue, bodies.data); 30 | collider.PackManifolds(bodies.data); 31 | 32 | RefreshContactJoints(); 33 | 34 | solver.SolveJoints(queue, bodies.data, bodies.size, collider.contactPoints.data, configuration); 35 | 36 | IntegratePosition(queue, dt); 37 | } 38 | 39 | NOINLINE void World::IntegrateVelocity(WorkQueue& queue, float dt) 40 | { 41 | MICROPROFILE_SCOPEI("Physics", "IntegrateVelocity", -1); 42 | 43 | parallelFor(queue, bodies.data, bodies.size, 32, [this, dt](RigidBody& body, int) { 44 | if (body.invMass > 0.0f) 45 | { 46 | body.acceleration.y += gravity; 47 | } 48 | 49 | body.velocity += body.acceleration * dt; 50 | body.acceleration = Vector2f(0.0f, 0.0f); 51 | 52 | body.angularVelocity += body.angularAcceleration * dt; 53 | body.angularAcceleration = 0.0f; 54 | }); 55 | } 56 | 57 | NOINLINE void World::IntegratePosition(WorkQueue& queue, float dt) 58 | { 59 | MICROPROFILE_SCOPEI("Physics", "IntegratePosition", -1); 60 | 61 | parallelFor(queue, bodies.data, bodies.size, 32, [dt](RigidBody& body, int) { 62 | body.coords.pos += body.displacingVelocity + body.velocity * dt; 63 | body.coords.Rotate(-(body.displacingAngularVelocity + body.angularVelocity * dt)); 64 | 65 | body.displacingVelocity = Vector2f(0.0f, 0.0f); 66 | body.displacingAngularVelocity = 0.0f; 67 | 68 | body.UpdateGeom(); 69 | }); 70 | } 71 | 72 | NOINLINE void World::RefreshContactJoints() 73 | { 74 | MICROPROFILE_SCOPEI("Physics", "RefreshContactJoints", -1); 75 | 76 | int matched = 0; 77 | int created = 0; 78 | int deleted = 0; 79 | 80 | { 81 | MICROPROFILE_SCOPEI("Physics", "Reset", -1); 82 | 83 | for (int jointIndex = 0; jointIndex < solver.contactJoints.size; jointIndex++) 84 | { 85 | solver.contactJoints[jointIndex].contactPointIndex = -1; 86 | } 87 | } 88 | 89 | { 90 | MICROPROFILE_SCOPEI("Physics", "Match", -1); 91 | 92 | for (int manifoldIndex = 0; manifoldIndex < collider.manifolds.size; ++manifoldIndex) 93 | { 94 | Manifold& man = collider.manifolds[manifoldIndex]; 95 | 96 | for (int collisionIndex = 0; collisionIndex < man.pointCount; collisionIndex++) 97 | { 98 | int contactPointIndex = man.pointIndex + collisionIndex; 99 | ContactPoint& col = collider.contactPoints[contactPointIndex]; 100 | 101 | if (col.solverIndex < 0) 102 | { 103 | col.solverIndex = solver.contactJoints.size; 104 | 105 | solver.contactJoints.push_back(ContactJoint(man.body1Index, man.body2Index, contactPointIndex)); 106 | 107 | created++; 108 | } 109 | else 110 | { 111 | ContactJoint& joint = solver.contactJoints[col.solverIndex]; 112 | 113 | assert(joint.body1Index == man.body1Index); 114 | assert(joint.body2Index == man.body2Index); 115 | 116 | joint.contactPointIndex = contactPointIndex; 117 | 118 | matched++; 119 | } 120 | } 121 | } 122 | } 123 | 124 | { 125 | MICROPROFILE_SCOPEI("Physics", "Cleanup", -1); 126 | 127 | for (int jointIndex = 0; jointIndex < solver.contactJoints.size;) 128 | { 129 | ContactJoint& joint = 
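// Cleanup: joints whose contact point disappeared this frame (contactPointIndex still -1
// after matching) are removed by overwriting them with the last joint; survivors write
// their slot back into the contact point's solverIndex so the contact cache stays consistent.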
solver.contactJoints[jointIndex]; 130 | 131 | if (joint.contactPointIndex < 0) 132 | { 133 | joint = solver.contactJoints[solver.contactJoints.size - 1]; 134 | solver.contactJoints.size--; 135 | 136 | deleted++; 137 | } 138 | else 139 | { 140 | collider.contactPoints[joint.contactPointIndex].solverIndex = jointIndex; 141 | jointIndex++; 142 | } 143 | } 144 | } 145 | 146 | MICROPROFILE_META_CPU("Matched", matched); 147 | MICROPROFILE_META_CPU("Created", created); 148 | MICROPROFILE_META_CPU("Deleted", deleted); 149 | } -------------------------------------------------------------------------------- /src/World.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "RigidBody.h" 4 | #include "Collider.h" 5 | #include "Solver.h" 6 | 7 | struct Configuration; 8 | 9 | struct World 10 | { 11 | enum SolveMode 12 | { 13 | Solve_Scalar, 14 | Solve_SSE2, 15 | Solve_AVX2, 16 | }; 17 | 18 | World(); 19 | 20 | RigidBody* AddBody(Coords2f coords, Vector2f size); 21 | 22 | void Update(WorkQueue& queue, float dt, const Configuration& configuration); 23 | 24 | NOINLINE void IntegrateVelocity(WorkQueue& queue, float dt); 25 | NOINLINE void IntegratePosition(WorkQueue& queue, float dt); 26 | NOINLINE void RefreshContactJoints(); 27 | 28 | float collisionTime; 29 | float mergeTime; 30 | float solveTime; 31 | 32 | AlignedArray bodies; 33 | Collider collider; 34 | Solver solver; 35 | 36 | float gravity; 37 | }; -------------------------------------------------------------------------------- /src/base/AlignedArray.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | struct AlignedArray 8 | { 9 | T* data; 10 | int size; 11 | int capacity; 12 | 13 | AlignedArray() 14 | : data(0) 15 | , size(0) 16 | , capacity(0) 17 | { 18 | } 19 | 20 | ~AlignedArray() 21 | { 22 | _mm_free(data); 23 | } 24 | 25 | AlignedArray(const AlignedArray&) = delete; 26 | AlignedArray& operator=(const AlignedArray&) = delete; 27 | 28 | AlignedArray(AlignedArray&& other) 29 | { 30 | data = other.data; 31 | size = other.size; 32 | capacity = other.capacity; 33 | 34 | other.data = 0; 35 | other.size = 0; 36 | other.capacity = 0; 37 | } 38 | 39 | AlignedArray& operator=(AlignedArray&& other) 40 | { 41 | _mm_free(data); 42 | 43 | data = other.data; 44 | size = other.size; 45 | capacity = other.capacity; 46 | 47 | other.data = 0; 48 | other.size = 0; 49 | other.capacity = 0; 50 | 51 | return *this; 52 | } 53 | 54 | T* begin() 55 | { 56 | return data; 57 | } 58 | 59 | T* end() 60 | { 61 | return data + size; 62 | } 63 | 64 | T& operator[](int i) 65 | { 66 | assert(i >= 0 && i < size); 67 | return data[i]; 68 | } 69 | 70 | const T& operator[](int i) const 71 | { 72 | assert(i >= 0 && i < size); 73 | return data[i]; 74 | } 75 | 76 | void push_back(const T& value) 77 | { 78 | if (size == capacity) 79 | { 80 | T copy = value; 81 | 82 | realloc(size + 1, true); 83 | 84 | data[size++] = copy; 85 | } 86 | else 87 | { 88 | data[size++] = value; 89 | } 90 | } 91 | 92 | void truncate(int newsize) 93 | { 94 | assert(newsize <= size); 95 | 96 | size = newsize; 97 | } 98 | 99 | void resize_copy(int newsize) 100 | { 101 | if (newsize > capacity) 102 | realloc(newsize, true); 103 | 104 | size = newsize; 105 | } 106 | 107 | void resize(int newsize) 108 | { 109 | if (newsize > capacity) 110 | realloc(newsize, false); 111 | 112 | size = newsize; 113 | } 114 | 115 | void realloc(int newsize, bool copy) 116 | { 
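// Grow geometrically (roughly 1.5x per step) and allocate with 32-byte alignment so the data
// can be loaded straight into AVX registers; resize() without copy leaves the new contents
// deliberately uninitialized.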
117 | int newcapacity = capacity; 118 | while (newcapacity < newsize) 119 | newcapacity += newcapacity / 2 + 1; 120 | 121 | // Leave 32b padding at the end to avoid buffer overruns for fast SIMD code 122 | T* newdata = static_cast(_mm_malloc(newcapacity * sizeof(T) + 32, 32)); 123 | 124 | if (data) 125 | { 126 | if (copy) 127 | memcpy(newdata, data, size * sizeof(T)); 128 | _mm_free(data); 129 | } 130 | 131 | data = newdata; 132 | capacity = newcapacity; 133 | } 134 | 135 | void clear() 136 | { 137 | size = 0; 138 | } 139 | }; 140 | -------------------------------------------------------------------------------- /src/base/DenseHash.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Internal implementation of DenseHashSet and DenseHashMap 9 | namespace detail 10 | { 11 | template 12 | class DenseHashTable 13 | { 14 | public: 15 | typedef typename std::vector::const_iterator const_iterator; 16 | 17 | DenseHashTable(size_t capacity, const Hash& hash, const Eq& eq) 18 | : filled(0) 19 | , hash(hash) 20 | , eq(eq) 21 | { 22 | if (capacity) 23 | { 24 | items.reserve(capacity); 25 | rehash(capacity); 26 | } 27 | } 28 | 29 | const_iterator begin() const 30 | { 31 | return items.begin(); 32 | } 33 | 34 | const_iterator end() const 35 | { 36 | return items.end(); 37 | } 38 | 39 | bool empty() const 40 | { 41 | return items.empty(); 42 | } 43 | 44 | size_t size() const 45 | { 46 | return items.size(); 47 | } 48 | 49 | size_t bucket_count() const 50 | { 51 | return buckets.size(); 52 | } 53 | 54 | float load_factor() const 55 | { 56 | return buckets.empty() ? 0 : float(filled) / float(buckets.size()); 57 | } 58 | 59 | void clear() 60 | { 61 | items.clear(); 62 | buckets.clear(); 63 | filled = 0; 64 | } 65 | 66 | protected: 67 | std::vector items; 68 | std::vector buckets; 69 | size_t filled; // number of non-empty buckets 70 | 71 | Hash hash; 72 | Eq eq; 73 | 74 | void rehash() 75 | { 76 | if (filled >= buckets.size() * 3 / 4) 77 | { 78 | rehash(items.size() * 2); 79 | } 80 | } 81 | 82 | void rehash(size_t capacity) 83 | { 84 | size_t newbuckets = 16; 85 | while (newbuckets < capacity) newbuckets *= 2; 86 | 87 | size_t hashmod = newbuckets - 1; 88 | 89 | std::vector(newbuckets, -1).swap(buckets); 90 | 91 | for (size_t i = 0; i < items.size(); ++i) 92 | { 93 | size_t bucket = hash(getKey(items[i])) & hashmod; 94 | 95 | for (size_t probe = 0; probe <= hashmod; ++probe) 96 | { 97 | if (buckets[bucket] < 0) 98 | { 99 | buckets[bucket] = i; 100 | break; 101 | } 102 | 103 | // Hash collision, quadratic probing 104 | bucket = (bucket + probe + 1) & hashmod; 105 | } 106 | } 107 | 108 | filled = items.size(); 109 | } 110 | 111 | int find_bucket(const Key& key) const 112 | { 113 | if (buckets.empty()) 114 | return -1; 115 | 116 | size_t hashmod = buckets.size() - 1; 117 | size_t bucket = hash(key) & hashmod; 118 | 119 | for (size_t probe = 0; probe <= hashmod; ++probe) 120 | { 121 | int32_t probe_index = buckets[bucket]; 122 | 123 | // Element does not exist, insert here 124 | if (probe_index == -1) 125 | return -1; 126 | 127 | // Not a tombstone and key matches 128 | if (probe_index >= 0 && eq(getKey(items[probe_index]), key)) 129 | return bucket; 130 | 131 | // Hash collision, quadratic probing 132 | bucket = (bucket + probe + 1) & hashmod; 133 | } 134 | 135 | // Hash table is full - this should not happen 136 | assert(false); 137 | return -1; 138 | } 139 | 140 | std::pair insert_item(const 
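// Buckets hold indices into the dense items vector: -1 means never used, -2 is a tombstone
// left by erase. Insertion reuses the first negative bucket it probes, but only consuming a
// never-used (-1) bucket increments `filled`, which drives the load-factor based rehash.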
Key& key) 141 | { 142 | assert(!buckets.empty()); 143 | 144 | size_t hashmod = buckets.size() - 1; 145 | size_t bucket = hash(key) & hashmod; 146 | 147 | for (size_t probe = 0; probe <= hashmod; ++probe) 148 | { 149 | int32_t probe_index = buckets[bucket]; 150 | 151 | // Element does not exist or a tombstone, insert here 152 | // TODO: this is incorrect! we have to follow the chain of tombstones to the end just in case our element does exist :( 153 | if (probe_index < 0) 154 | { 155 | buckets[bucket] = items.size(); 156 | filled += probe_index == -1; 157 | 158 | items.push_back(Item()); 159 | getKey(items.back()) = key; 160 | 161 | return std::make_pair(&items.back(), true); 162 | } 163 | 164 | // Key matches, insert here 165 | if (eq(getKey(items[probe_index]), key)) 166 | return std::make_pair(&items[probe_index], false); 167 | 168 | // Hash collision, quadratic probing 169 | bucket = (bucket + probe + 1) & hashmod; 170 | } 171 | 172 | // Hash table is full - this should not happen 173 | assert(false); 174 | return std::make_pair(static_cast(0), false); 175 | } 176 | 177 | void erase_bucket(int bucket) 178 | { 179 | assert(bucket >= 0); 180 | 181 | int32_t probe_index = buckets[bucket]; 182 | assert(probe_index >= 0); 183 | 184 | // move last key 185 | // TODO: this is suboptimal! we don't need to compare keys when searching for this, it's enough to find a bucket that points to items.size()-1 186 | int probe_bucket = find_bucket(getKey(items.back())); 187 | assert(probe_bucket >= 0); 188 | assert(buckets[probe_bucket] == int32_t(items.size() - 1)); 189 | 190 | items[probe_index] = items.back(); 191 | buckets[probe_bucket] = probe_index; 192 | 193 | items.pop_back(); 194 | 195 | // mark bucket as tombstone 196 | buckets[bucket] = -2; 197 | } 198 | 199 | private: 200 | // Interface to support both key and pair 201 | static const Key& getKey(const Key& item) { return item; } 202 | static Key& getKey(Key& item) { return item; } 203 | template static const Key& getKey(const std::pair& item) { return item.first; } 204 | template static Key& getKey(std::pair& item) { return item.first; } 205 | }; 206 | } 207 | 208 | template , typename Eq = std::equal_to> 209 | class DenseHashSet: public detail::DenseHashTable 210 | { 211 | public: 212 | explicit DenseHashSet(size_t capacity = 0, const Hash& hash = Hash(), const Eq& eq = Eq()) 213 | : detail::DenseHashTable(capacity, hash, eq) 214 | { 215 | } 216 | 217 | bool contains(const Key& key) const 218 | { 219 | return this->find_bucket(key) >= 0; 220 | } 221 | 222 | bool insert(const Key& key) 223 | { 224 | this->rehash(); 225 | 226 | return this->insert_item(key).second; 227 | } 228 | 229 | void erase(const Key& key) 230 | { 231 | int bucket = this->find_bucket(key); 232 | 233 | if (bucket >= 0) 234 | this->erase_bucket(bucket); 235 | } 236 | }; 237 | 238 | // This is a faster alternative of std::unordered_map, but it does not implement the same interface (i.e. it does not support erasing and has contains() instead of find()) 239 | template , typename Eq = std::equal_to> 240 | class DenseHashMap: public detail::DenseHashTable, Hash, Eq> 241 | { 242 | public: 243 | explicit DenseHashMap(size_t capacity = 0, const Hash& hash = Hash(), const Eq& eq = Eq()) 244 | : detail::DenseHashTable, Hash, Eq>(capacity, hash, eq) 245 | { 246 | } 247 | 248 | const Value* find(const Key& key) const 249 | { 250 | int bucket = this->find_bucket(key); 251 | 252 | return bucket < 0 ? 
NULL : &this->items[this->buckets[bucket]].second; 253 | } 254 | 255 | Value& operator[](const Key& key) 256 | { 257 | this->rehash(); 258 | 259 | return this->insert_item(key).first->second; 260 | } 261 | 262 | void erase(const Key& key) 263 | { 264 | int bucket = this->find_bucket(key); 265 | 266 | if (bucket >= 0) 267 | this->erase_bucket(bucket); 268 | } 269 | }; 270 | -------------------------------------------------------------------------------- /src/base/Parallel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "WorkQueue.h" 4 | 5 | #include 6 | 7 | #include "microprofile.h" 8 | 9 | template 10 | inline T& parallelForIndex(T* data, unsigned int index) 11 | { 12 | return data[index]; 13 | } 14 | 15 | inline unsigned int parallelForIndex(unsigned int data, unsigned int index) 16 | { 17 | return data + index; 18 | } 19 | 20 | template 21 | inline void serialFor(WorkQueue& queue, T data, unsigned int count, unsigned int groupSize, F func) 22 | { 23 | for (unsigned int i = 0; i < count; ++i) 24 | func(parallelForIndex(data, i), 0); 25 | } 26 | 27 | template 28 | inline void parallelFor(WorkQueue& queue, T data, unsigned int count, unsigned int groupSize, F func) 29 | { 30 | if (queue.getWorkerCount() == 0 || count <= groupSize) 31 | { 32 | for (unsigned int i = 0; i < count; ++i) 33 | func(parallelForIndex(data, i), 0); 34 | 35 | return; 36 | } 37 | 38 | MICROPROFILE_SCOPEI("WorkQueue", "ParallelFor", 0x808080); 39 | 40 | struct Item: WorkQueue::Item 41 | { 42 | WorkQueue* queue; 43 | 44 | std::atomic counter; 45 | std::atomic ready; 46 | 47 | T data; 48 | unsigned int count; 49 | 50 | unsigned int groupSize; 51 | unsigned int groupCount; 52 | 53 | F* func; 54 | 55 | Item(): counter(0), ready(0) 56 | { 57 | } 58 | 59 | void run(int worker) override 60 | { 61 | unsigned int groups = 0; 62 | 63 | for (;;) 64 | { 65 | unsigned int groupIndex = counter.fetch_add(1); 66 | if (groupIndex >= groupCount) break; 67 | 68 | unsigned int begin = groupIndex * groupSize; 69 | unsigned int end = std::min(count, begin + groupSize); 70 | 71 | for (unsigned int i = begin; i < end; ++i) 72 | (*func)(parallelForIndex(data, i), worker); 73 | 74 | groups++; 75 | } 76 | 77 | ready.fetch_add(groups); 78 | } 79 | }; 80 | 81 | auto item = std::make_shared(); 82 | 83 | item->queue = &queue; 84 | item->data = data; 85 | item->count = count; 86 | item->groupSize = groupSize; 87 | item->groupCount = (count + groupSize - 1) / groupSize; 88 | item->func = &func; 89 | 90 | int optimalWorkerCount = std::min(unsigned(queue.getWorkerCount()), item->groupCount - 1); 91 | 92 | queue.pushItem(item, optimalWorkerCount); 93 | 94 | item->run(queue.getWorkerCount()); 95 | 96 | if (item->ready.load() < item->groupCount) 97 | { 98 | MICROPROFILE_SCOPEI("WorkQueue", "Wait", 0xff0000); 99 | 100 | do std::this_thread::yield(); 101 | while (item->ready.load() < item->groupCount); 102 | } 103 | } -------------------------------------------------------------------------------- /src/base/RadixSort.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | inline unsigned int radixUnsignedInt(unsigned int v) 4 | { 5 | return v; 6 | } 7 | 8 | inline unsigned int radixInt(int v) 9 | { 10 | // flip sign bit 11 | return static_cast(v) ^ 0x80000000; 12 | } 13 | 14 | inline unsigned int radixUnsignedFloat(const float& v) 15 | { 16 | return *reinterpret_cast(&v); 17 | } 18 | 19 | inline unsigned int radixFloat(const float& v) 20 
| { 21 | // if sign bit is 0, flip sign bit 22 | // if sign bit is 1, flip everything 23 | int f = *reinterpret_cast(&v); 24 | unsigned int mask = int(f >> 31) | 0x80000000; 25 | return f ^ mask; 26 | } 27 | 28 | template inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred) 29 | { 30 | unsigned int h[2048*3]; 31 | 32 | for (size_t i = 0; i < 2048*3; ++i) h[i] = 0; 33 | 34 | unsigned int* h0 = h; 35 | unsigned int* h1 = h + 2048; 36 | unsigned int* h2 = h + 2048*2; 37 | 38 | T* e0_end = e0 + count; 39 | 40 | #define _0(h) ((h) & 2047) 41 | #define _1(h) (((h) >> 11) & 2047) 42 | #define _2(h) ((h) >> 22) 43 | 44 | // fill histogram 45 | for (const T* i = e0; i != e0_end; ++i) 46 | { 47 | unsigned int h = pred(*i); 48 | 49 | h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; 50 | } 51 | 52 | // compute offsets 53 | { 54 | unsigned int sum0 = 0, sum1 = 0, sum2 = 0; 55 | 56 | for (unsigned int i = 0; i < 2048; ++i) 57 | { 58 | unsigned int c0 = h0[i]; 59 | unsigned int c1 = h1[i]; 60 | unsigned int c2 = h2[i]; 61 | 62 | h0[i] = sum0; 63 | h1[i] = sum1; 64 | h2[i] = sum2; 65 | 66 | sum0 += c0; 67 | sum1 += c1; 68 | sum2 += c2; 69 | } 70 | } 71 | 72 | for (size_t i = 0; i < count; ++i) 73 | { 74 | unsigned int h = pred(e0[i]); 75 | e1[h0[_0(h)]++] = e0[i]; 76 | } 77 | 78 | for (size_t i = 0; i < count; ++i) 79 | { 80 | unsigned int h = pred(e1[i]); 81 | e0[h1[_1(h)]++] = e1[i]; 82 | } 83 | 84 | for (size_t i = 0; i < count; ++i) 85 | { 86 | unsigned int h = pred(e0[i]); 87 | e1[h2[_2(h)]++] = e0[i]; 88 | } 89 | 90 | #undef _0 91 | #undef _1 92 | #undef _2 93 | 94 | return e1; 95 | } 96 | 97 | template inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred) 98 | { 99 | unsigned int h[256*4]; 100 | 101 | for (size_t i = 0; i < 256*4; ++i) h[i] = 0; 102 | 103 | unsigned int* h0 = h; 104 | unsigned int* h1 = h + 256; 105 | unsigned int* h2 = h + 256*2; 106 | unsigned int* h3 = h + 256*3; 107 | 108 | T* e0_end = e0 + count; 109 | 110 | #define _0(h) ((h) & 255) 111 | #define _1(h) (((h) >> 8) & 255) 112 | #define _2(h) (((h) >> 16) & 255) 113 | #define _3(h) ((h) >> 24) 114 | 115 | // fill histogram 116 | for (const T* i = e0; i != e0_end; ++i) 117 | { 118 | unsigned int h = pred(*i); 119 | 120 | h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++; 121 | } 122 | 123 | // compute offsets 124 | { 125 | unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; 126 | 127 | for (unsigned int i = 0; i < 256; ++i) 128 | { 129 | unsigned int c0 = h0[i]; 130 | unsigned int c1 = h1[i]; 131 | unsigned int c2 = h2[i]; 132 | unsigned int c3 = h3[i]; 133 | 134 | h0[i] = sum0; 135 | h1[i] = sum1; 136 | h2[i] = sum2; 137 | h3[i] = sum3; 138 | 139 | sum0 += c0; 140 | sum1 += c1; 141 | sum2 += c2; 142 | sum3 += c3; 143 | } 144 | } 145 | 146 | for (size_t i = 0; i < count; ++i) 147 | { 148 | unsigned int h = pred(e0[i]); 149 | e1[h0[_0(h)]++] = e0[i]; 150 | } 151 | 152 | for (size_t i = 0; i < count; ++i) 153 | { 154 | unsigned int h = pred(e1[i]); 155 | e0[h1[_1(h)]++] = e1[i]; 156 | } 157 | 158 | for (size_t i = 0; i < count; ++i) 159 | { 160 | unsigned int h = pred(e0[i]); 161 | e1[h2[_2(h)]++] = e0[i]; 162 | } 163 | 164 | for (size_t i = 0; i < count; ++i) 165 | { 166 | unsigned int h = pred(e1[i]); 167 | e0[h3[_3(h)]++] = e1[i]; 168 | } 169 | 170 | #undef _0 171 | #undef _1 172 | #undef _2 173 | #undef _3 174 | 175 | return e0; 176 | } -------------------------------------------------------------------------------- /src/base/SIMD.h: -------------------------------------------------------------------------------- 1 | 
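The three-pass radix sort above processes 11/11/10-bit digits and ping-pongs between two equally sized buffers, returning a pointer to whichever buffer ends up holding the final order. Below is a minimal standalone sketch of how it might be driven with float keys; the `Entry` struct, the sample values, and the include path are illustrative assumptions, not code from this repository.

```cpp
// Hypothetical usage sketch of radixSort3 + radixFloat (assumed include path).
#include <cstdio>
#include <vector>

#include "base/RadixSort.h"

struct Entry
{
    float key; // e.g. a sort key such as an AABB min-x coordinate
    int index; // payload carried along with the key
};

int main()
{
    std::vector<Entry> a = {{3.5f, 0}, {-1.0f, 1}, {0.25f, 2}, {-7.5f, 3}};
    std::vector<Entry> b(a.size()); // scratch buffer; the sort ping-pongs between a and b

    // radixFloat maps IEEE-754 bits to an order-preserving unsigned key for ordinary
    // finite floats, so negative values come out before positive ones.
    Entry* sorted = radixSort3(a.data(), b.data(), a.size(),
                               [](const Entry& e) { return radixFloat(e.key); });

    // radixSort3 returns the buffer that holds the result (for three passes, that is b).
    for (size_t i = 0; i < a.size(); ++i)
        printf("%.2f (index %d)\n", sorted[i].key, sorted[i].index);
}
```

The double-buffer design avoids any in-place shuffling: each pass streams elements from one buffer into the other at offsets precomputed from the histograms, which is why the caller must provide a scratch array of the same size and use the returned pointer rather than assuming the input array was sorted in place.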
#pragma once 2 | 3 | #include 4 | 5 | #ifdef _MSC_VER 6 | #define SIMD_INLINE __forceinline 7 | #define SIMD_ALIGN(n) __declspec(align(n)) 8 | #else 9 | #define SIMD_INLINE __attribute__((always_inline)) inline 10 | #define SIMD_ALIGN(n) __attribute__((aligned(n))) 11 | #endif 12 | 13 | namespace simd 14 | { 15 | template struct VNf_; 16 | template struct VNi_; 17 | template struct VNb_; 18 | 19 | template using VNf = typename VNf_::type; 20 | template using VNi = typename VNi_::type; 21 | template using VNb = typename VNb_::type; 22 | 23 | template void dump(const char* name, const T& v) 24 | { 25 | printf("%s:", name); 26 | 27 | const float* fptr = reinterpret_cast(&v); 28 | const int* iptr = reinterpret_cast(&v); 29 | 30 | for (size_t i = 0; i < sizeof(v) / 4; ++i) 31 | printf(" %f [%08x]", fptr[i], iptr[i]); 32 | 33 | printf("\n"); 34 | } 35 | } 36 | 37 | #define SIMD_DUMP(v) simd::dump(#v, v) 38 | 39 | #include "SIMD_Scalar.h" 40 | 41 | #ifdef __SSE2__ 42 | #include "SIMD_SSE2.h" 43 | #endif 44 | 45 | #ifdef __AVX2__ 46 | #include "SIMD_AVX2.h" 47 | #endif -------------------------------------------------------------------------------- /src/base/SIMD_AVX2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "SIMD_AVX2_Transpose.h" 4 | 5 | namespace simd 6 | { 7 | struct V8f 8 | { 9 | __m256 v; 10 | 11 | SIMD_INLINE V8f() 12 | { 13 | } 14 | 15 | SIMD_INLINE V8f(__m256 v): v(v) 16 | { 17 | } 18 | 19 | SIMD_INLINE operator __m256() const 20 | { 21 | return v; 22 | } 23 | 24 | SIMD_INLINE static V8f zero() 25 | { 26 | return _mm256_setzero_ps(); 27 | } 28 | 29 | SIMD_INLINE static V8f one(float v) 30 | { 31 | return _mm256_set1_ps(v); 32 | } 33 | 34 | SIMD_INLINE static V8f sign() 35 | { 36 | return _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); 37 | } 38 | 39 | SIMD_INLINE static V8f load(const float* ptr) 40 | { 41 | return _mm256_load_ps(ptr); 42 | } 43 | }; 44 | 45 | struct V8i 46 | { 47 | __m256i v; 48 | 49 | SIMD_INLINE V8i() 50 | { 51 | } 52 | 53 | SIMD_INLINE V8i(__m256i v): v(v) 54 | { 55 | } 56 | 57 | SIMD_INLINE operator __m256i() const 58 | { 59 | return v; 60 | } 61 | 62 | SIMD_INLINE static V8i zero() 63 | { 64 | return _mm256_setzero_si256(); 65 | } 66 | 67 | SIMD_INLINE static V8i one(int v) 68 | { 69 | return _mm256_set1_epi32(v); 70 | } 71 | 72 | SIMD_INLINE static V8i load(const int* ptr) 73 | { 74 | return _mm256_load_si256(reinterpret_cast(ptr)); 75 | } 76 | }; 77 | 78 | struct V8b 79 | { 80 | __m256 v; 81 | 82 | SIMD_INLINE V8b() 83 | { 84 | } 85 | 86 | SIMD_INLINE V8b(__m256 v): v(v) 87 | { 88 | } 89 | 90 | SIMD_INLINE V8b(__m256i v): v(_mm256_castsi256_ps(v)) 91 | { 92 | } 93 | 94 | SIMD_INLINE operator __m256() const 95 | { 96 | return v; 97 | } 98 | 99 | SIMD_INLINE static V8b zero() 100 | { 101 | return _mm256_setzero_ps(); 102 | } 103 | }; 104 | 105 | SIMD_INLINE V8i bitcast(V8f v) 106 | { 107 | return _mm256_castps_si256(v.v); 108 | } 109 | 110 | SIMD_INLINE V8f bitcast(V8i v) 111 | { 112 | return _mm256_castsi256_ps(v.v); 113 | } 114 | 115 | SIMD_INLINE V8f operator+(V8f v) 116 | { 117 | return v; 118 | } 119 | 120 | SIMD_INLINE V8f operator-(V8f v) 121 | { 122 | return _mm256_xor_ps(V8f::sign(), v.v); 123 | } 124 | 125 | SIMD_INLINE V8f operator+(V8f l, V8f r) 126 | { 127 | return _mm256_add_ps(l.v, r.v); 128 | } 129 | 130 | SIMD_INLINE V8f operator-(V8f l, V8f r) 131 | { 132 | return _mm256_sub_ps(l.v, r.v); 133 | } 134 | 135 | SIMD_INLINE V8f operator*(V8f l, V8f r) 136 | { 137 | return 
_mm256_mul_ps(l.v, r.v); 138 | } 139 | 140 | SIMD_INLINE V8f operator/(V8f l, V8f r) 141 | { 142 | return _mm256_div_ps(l.v, r.v); 143 | } 144 | 145 | SIMD_INLINE void operator+=(V8f& l, V8f r) 146 | { 147 | l.v = _mm256_add_ps(l.v, r.v); 148 | } 149 | 150 | SIMD_INLINE void operator-=(V8f& l, V8f r) 151 | { 152 | l.v = _mm256_sub_ps(l.v, r.v); 153 | } 154 | 155 | SIMD_INLINE void operator*=(V8f& l, V8f r) 156 | { 157 | l.v = _mm256_mul_ps(l.v, r.v); 158 | } 159 | 160 | SIMD_INLINE void operator/=(V8f& l, V8f r) 161 | { 162 | l.v = _mm256_div_ps(l.v, r.v); 163 | } 164 | 165 | SIMD_INLINE V8b operator==(V8f l, V8f r) 166 | { 167 | return _mm256_cmp_ps(l.v, r.v, _CMP_EQ_UQ); 168 | } 169 | 170 | SIMD_INLINE V8b operator==(V8i l, V8i r) 171 | { 172 | return _mm256_cmpeq_epi32(l.v, r.v); 173 | } 174 | 175 | SIMD_INLINE V8b operator!=(V8f l, V8f r) 176 | { 177 | return _mm256_cmp_ps(l.v, r.v, _CMP_NEQ_UQ); 178 | } 179 | 180 | SIMD_INLINE V8b operator!=(V8i l, V8i r) 181 | { 182 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpeq_epi32(l.v, r.v)); 183 | } 184 | 185 | SIMD_INLINE V8b operator<(V8f l, V8f r) 186 | { 187 | return _mm256_cmp_ps(l.v, r.v, _CMP_LT_OQ); 188 | } 189 | 190 | SIMD_INLINE V8b operator<(V8i l, V8i r) 191 | { 192 | return _mm256_cmpgt_epi32(r.v, l.v); 193 | } 194 | 195 | SIMD_INLINE V8b operator<=(V8f l, V8f r) 196 | { 197 | return _mm256_cmp_ps(l.v, r.v, _CMP_LE_OQ); 198 | } 199 | 200 | SIMD_INLINE V8b operator<=(V8i l, V8i r) 201 | { 202 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpgt_epi32(l.v, r.v)); 203 | } 204 | 205 | SIMD_INLINE V8b operator>(V8f l, V8f r) 206 | { 207 | return _mm256_cmp_ps(l.v, r.v, _CMP_GT_OQ); 208 | } 209 | 210 | SIMD_INLINE V8b operator>(V8i l, V8i r) 211 | { 212 | return _mm256_cmpgt_epi32(l.v, r.v); 213 | } 214 | 215 | SIMD_INLINE V8b operator>=(V8f l, V8f r) 216 | { 217 | return _mm256_cmp_ps(l.v, r.v, _CMP_GE_OQ); 218 | } 219 | 220 | SIMD_INLINE V8b operator>=(V8i l, V8i r) 221 | { 222 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpgt_epi32(r.v, l.v)); 223 | } 224 | 225 | SIMD_INLINE V8b operator!(V8b v) 226 | { 227 | return _mm256_xor_ps(_mm256_setzero_ps(), v.v); 228 | } 229 | 230 | SIMD_INLINE V8b operator&(V8b l, V8b r) 231 | { 232 | return _mm256_and_ps(l.v, r.v); 233 | } 234 | 235 | SIMD_INLINE V8b operator|(V8b l, V8b r) 236 | { 237 | return _mm256_or_ps(l.v, r.v); 238 | } 239 | 240 | SIMD_INLINE V8b operator^(V8b l, V8b r) 241 | { 242 | return _mm256_xor_ps(l.v, r.v); 243 | } 244 | 245 | SIMD_INLINE void operator&=(V8b& l, V8b r) 246 | { 247 | l.v = _mm256_and_ps(l.v, r.v); 248 | } 249 | 250 | SIMD_INLINE void operator|=(V8b& l, V8b r) 251 | { 252 | l.v = _mm256_or_ps(l.v, r.v); 253 | } 254 | 255 | SIMD_INLINE void operator^=(V8b& l, V8b r) 256 | { 257 | l.v = _mm256_xor_ps(l.v, r.v); 258 | } 259 | 260 | SIMD_INLINE V8f abs(V8f v) 261 | { 262 | return _mm256_andnot_ps(V8f::sign(), v.v); 263 | } 264 | 265 | SIMD_INLINE V8f copysign(V8f x, V8f y) 266 | { 267 | V8f sign = V8f::sign(); 268 | 269 | return _mm256_or_ps(_mm256_andnot_ps(sign.v, x.v), _mm256_and_ps(y.v, sign.v)); 270 | } 271 | 272 | SIMD_INLINE V8f flipsign(V8f x, V8f y) 273 | { 274 | return _mm256_xor_ps(x.v, _mm256_and_ps(y.v, V8f::sign())); 275 | } 276 | 277 | SIMD_INLINE V8f min(V8f l, V8f r) 278 | { 279 | return _mm256_min_ps(l.v, r.v); 280 | } 281 | 282 | SIMD_INLINE V8f max(V8f l, V8f r) 283 | { 284 | return _mm256_max_ps(l.v, r.v); 285 | } 286 | 287 | SIMD_INLINE V8f select(V8f l, V8f r, V8b m) 288 | { 289 | return 
_mm256_blendv_ps(l.v, r.v, m.v); 290 | } 291 | 292 | SIMD_INLINE V8i select(V8i l, V8i r, V8b m) 293 | { 294 | __m256i mi = _mm256_castps_si256(m.v); 295 | 296 | return _mm256_blendv_epi8(l.v, r.v, mi); 297 | } 298 | 299 | SIMD_INLINE bool none(V8b v) 300 | { 301 | return _mm256_movemask_ps(v.v) == 0; 302 | } 303 | 304 | SIMD_INLINE bool any(V8b v) 305 | { 306 | return _mm256_movemask_ps(v.v) != 0; 307 | } 308 | 309 | SIMD_INLINE bool all(V8b v) 310 | { 311 | return _mm256_movemask_ps(v.v) == 31; 312 | } 313 | 314 | SIMD_INLINE void store(V8f v, float* ptr) 315 | { 316 | _mm256_store_ps(ptr, v.v); 317 | } 318 | 319 | SIMD_INLINE void store(V8i v, int* ptr) 320 | { 321 | _mm256_store_si256(reinterpret_cast<__m256i*>(ptr), v.v); 322 | } 323 | 324 | SIMD_INLINE void loadindexed4(V8f& v0, V8f& v1, V8f& v2, V8f& v3, const void* base, const int indices[8], unsigned int stride) 325 | { 326 | const char* ptr = static_cast(base); 327 | 328 | __m128 hr0 = _mm_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 329 | __m128 hr1 = _mm_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 330 | __m128 hr2 = _mm_load_ps(reinterpret_cast(ptr + indices[2] * stride)); 331 | __m128 hr3 = _mm_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 332 | __m128 hr4 = _mm_load_ps(reinterpret_cast(ptr + indices[4] * stride)); 333 | __m128 hr5 = _mm_load_ps(reinterpret_cast(ptr + indices[5] * stride)); 334 | __m128 hr6 = _mm_load_ps(reinterpret_cast(ptr + indices[6] * stride)); 335 | __m128 hr7 = _mm_load_ps(reinterpret_cast(ptr + indices[7] * stride)); 336 | 337 | __m256 r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr0), hr4, 1); 338 | __m256 r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr1), hr5, 1); 339 | __m256 r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr2), hr6, 1); 340 | __m256 r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr3), hr7, 1); 341 | 342 | _MM_TRANSPOSE8_LANE4_PS(r0, r1, r2, r3); 343 | 344 | v0.v = r0; 345 | v1.v = r1; 346 | v2.v = r2; 347 | v3.v = r3; 348 | } 349 | 350 | SIMD_INLINE void storeindexed4(const V8f& v0, const V8f& v1, const V8f& v2, const V8f& v3, void* base, const int indices[8], unsigned int stride) 351 | { 352 | char* ptr = static_cast(base); 353 | 354 | __m256 r0 = v0.v; 355 | __m256 r1 = v1.v; 356 | __m256 r2 = v2.v; 357 | __m256 r3 = v3.v; 358 | 359 | _MM_TRANSPOSE8_LANE4_PS(r0, r1, r2, r3); 360 | 361 | __m128 hr0 = _mm256_castps256_ps128(r0); 362 | __m128 hr1 = _mm256_castps256_ps128(r1); 363 | __m128 hr2 = _mm256_castps256_ps128(r2); 364 | __m128 hr3 = _mm256_castps256_ps128(r3); 365 | __m128 hr4 = _mm256_extractf128_ps(r0, 1); 366 | __m128 hr5 = _mm256_extractf128_ps(r1, 1); 367 | __m128 hr6 = _mm256_extractf128_ps(r2, 1); 368 | __m128 hr7 = _mm256_extractf128_ps(r3, 1); 369 | 370 | _mm_store_ps(reinterpret_cast(ptr + indices[0] * stride), hr0); 371 | _mm_store_ps(reinterpret_cast(ptr + indices[1] * stride), hr1); 372 | _mm_store_ps(reinterpret_cast(ptr + indices[2] * stride), hr2); 373 | _mm_store_ps(reinterpret_cast(ptr + indices[3] * stride), hr3); 374 | _mm_store_ps(reinterpret_cast(ptr + indices[4] * stride), hr4); 375 | _mm_store_ps(reinterpret_cast(ptr + indices[5] * stride), hr5); 376 | _mm_store_ps(reinterpret_cast(ptr + indices[6] * stride), hr6); 377 | _mm_store_ps(reinterpret_cast(ptr + indices[7] * stride), hr7); 378 | } 379 | 380 | SIMD_INLINE void loadindexed8(V8f& v0, V8f& v1, V8f& v2, V8f& v3, V8f& v4, V8f& v5, V8f& v6, V8f& v7, const void* base, const int indices[8], unsigned int stride) 381 | { 382 | const char* ptr = static_cast(base); 
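// Gather one 32-byte row (8 contiguous floats) per index from the strided AoS layout
// using aligned loads, then transpose the resulting 8x8 block so that each output
// register holds the same field for all 8 indexed elements (SoA form), ready for SIMD math.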
383 | 384 | __m256 r0 = _mm256_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 385 | __m256 r1 = _mm256_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 386 | __m256 r2 = _mm256_load_ps(reinterpret_cast(ptr + indices[2] * stride)); 387 | __m256 r3 = _mm256_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 388 | __m256 r4 = _mm256_load_ps(reinterpret_cast(ptr + indices[4] * stride)); 389 | __m256 r5 = _mm256_load_ps(reinterpret_cast(ptr + indices[5] * stride)); 390 | __m256 r6 = _mm256_load_ps(reinterpret_cast(ptr + indices[6] * stride)); 391 | __m256 r7 = _mm256_load_ps(reinterpret_cast(ptr + indices[7] * stride)); 392 | 393 | _MM_TRANSPOSE8_PS(r0, r1, r2, r3, r4, r5, r6, r7); 394 | 395 | v0.v = r0; 396 | v1.v = r1; 397 | v2.v = r2; 398 | v3.v = r3; 399 | v4.v = r4; 400 | v5.v = r5; 401 | v6.v = r6; 402 | v7.v = r7; 403 | } 404 | } 405 | 406 | namespace simd 407 | { 408 | template <> struct VNf_<8> { typedef V8f type; }; 409 | template <> struct VNi_<8> { typedef V8i type; }; 410 | template <> struct VNb_<8> { typedef V8b type; }; 411 | } 412 | 413 | using simd::V8f; 414 | using simd::V8i; 415 | using simd::V8b; -------------------------------------------------------------------------------- /src/base/SIMD_AVX2_Transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Transpose 4x4 blocks within each lane 4 | #define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \ 5 | do { \ 6 | __m256 __t0, __t1, __t2, __t3; \ 7 | __t0 = _mm256_unpacklo_ps(row0, row1); \ 8 | __t1 = _mm256_unpackhi_ps(row0, row1); \ 9 | __t2 = _mm256_unpacklo_ps(row2, row3); \ 10 | __t3 = _mm256_unpackhi_ps(row2, row3); \ 11 | row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ 12 | row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ 13 | row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ 14 | row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ 15 | } while (0) 16 | 17 | // http://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 18 | #define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \ 19 | do { \ 20 | __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \ 21 | __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \ 22 | __t0 = _mm256_unpacklo_ps(row0, row1); \ 23 | __t1 = _mm256_unpackhi_ps(row0, row1); \ 24 | __t2 = _mm256_unpacklo_ps(row2, row3); \ 25 | __t3 = _mm256_unpackhi_ps(row2, row3); \ 26 | __t4 = _mm256_unpacklo_ps(row4, row5); \ 27 | __t5 = _mm256_unpackhi_ps(row4, row5); \ 28 | __t6 = _mm256_unpacklo_ps(row6, row7); \ 29 | __t7 = _mm256_unpackhi_ps(row6, row7); \ 30 | __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ 31 | __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ 32 | __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ 33 | __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ 34 | __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \ 35 | __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \ 36 | __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \ 37 | __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \ 38 | row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \ 39 | row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \ 40 | row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \ 41 | row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \ 42 | row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \ 43 | row5 = 
_mm256_permute2f128_ps(__tt1, __tt5, 0x31); \ 44 | row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \ 45 | row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \ 46 | } while (0) 47 | -------------------------------------------------------------------------------- /src/base/SIMD_SSE2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace simd 4 | { 5 | struct V4f 6 | { 7 | __m128 v; 8 | 9 | SIMD_INLINE V4f() 10 | { 11 | } 12 | 13 | SIMD_INLINE V4f(__m128 v): v(v) 14 | { 15 | } 16 | 17 | SIMD_INLINE operator __m128() const 18 | { 19 | return v; 20 | } 21 | 22 | SIMD_INLINE static V4f zero() 23 | { 24 | return _mm_setzero_ps(); 25 | } 26 | 27 | SIMD_INLINE static V4f one(float v) 28 | { 29 | return _mm_set1_ps(v); 30 | } 31 | 32 | SIMD_INLINE static V4f sign() 33 | { 34 | return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); 35 | } 36 | 37 | SIMD_INLINE static V4f load(const float* ptr) 38 | { 39 | return _mm_load_ps(ptr); 40 | } 41 | }; 42 | 43 | struct V4i 44 | { 45 | __m128i v; 46 | 47 | SIMD_INLINE V4i() 48 | { 49 | } 50 | 51 | SIMD_INLINE V4i(__m128i v): v(v) 52 | { 53 | } 54 | 55 | SIMD_INLINE operator __m128i() const 56 | { 57 | return v; 58 | } 59 | 60 | SIMD_INLINE static V4i zero() 61 | { 62 | return _mm_setzero_si128(); 63 | } 64 | 65 | SIMD_INLINE static V4i one(int v) 66 | { 67 | return _mm_set1_epi32(v); 68 | } 69 | 70 | SIMD_INLINE static V4i load(const int* ptr) 71 | { 72 | return _mm_load_si128(reinterpret_cast(ptr)); 73 | } 74 | }; 75 | 76 | struct V4b 77 | { 78 | __m128 v; 79 | 80 | SIMD_INLINE V4b() 81 | { 82 | } 83 | 84 | SIMD_INLINE V4b(__m128 v): v(v) 85 | { 86 | } 87 | 88 | SIMD_INLINE V4b(__m128i v): v(_mm_castsi128_ps(v)) 89 | { 90 | } 91 | 92 | SIMD_INLINE operator __m128() const 93 | { 94 | return v; 95 | } 96 | 97 | SIMD_INLINE static V4b zero() 98 | { 99 | return _mm_setzero_ps(); 100 | } 101 | }; 102 | 103 | SIMD_INLINE V4f bitcast(V4i v) 104 | { 105 | return _mm_castsi128_ps(v.v); 106 | } 107 | 108 | SIMD_INLINE V4i bitcast(V4f v) 109 | { 110 | return _mm_castps_si128(v.v); 111 | } 112 | 113 | SIMD_INLINE V4f operator+(V4f v) 114 | { 115 | return v; 116 | } 117 | 118 | SIMD_INLINE V4f operator-(V4f v) 119 | { 120 | return _mm_xor_ps(V4f::sign(), v.v); 121 | } 122 | 123 | SIMD_INLINE V4f operator+(V4f l, V4f r) 124 | { 125 | return _mm_add_ps(l.v, r.v); 126 | } 127 | 128 | SIMD_INLINE V4f operator-(V4f l, V4f r) 129 | { 130 | return _mm_sub_ps(l.v, r.v); 131 | } 132 | 133 | SIMD_INLINE V4f operator*(V4f l, V4f r) 134 | { 135 | return _mm_mul_ps(l.v, r.v); 136 | } 137 | 138 | SIMD_INLINE V4f operator/(V4f l, V4f r) 139 | { 140 | return _mm_div_ps(l.v, r.v); 141 | } 142 | 143 | SIMD_INLINE void operator+=(V4f& l, V4f r) 144 | { 145 | l.v = _mm_add_ps(l.v, r.v); 146 | } 147 | 148 | SIMD_INLINE void operator-=(V4f& l, V4f r) 149 | { 150 | l.v = _mm_sub_ps(l.v, r.v); 151 | } 152 | 153 | SIMD_INLINE void operator*=(V4f& l, V4f r) 154 | { 155 | l.v = _mm_mul_ps(l.v, r.v); 156 | } 157 | 158 | SIMD_INLINE void operator/=(V4f& l, V4f r) 159 | { 160 | l.v = _mm_div_ps(l.v, r.v); 161 | } 162 | 163 | SIMD_INLINE V4b operator==(V4f l, V4f r) 164 | { 165 | return _mm_cmpeq_ps(l.v, r.v); 166 | } 167 | 168 | SIMD_INLINE V4b operator==(V4i l, V4i r) 169 | { 170 | return _mm_cmpeq_epi32(l.v, r.v); 171 | } 172 | 173 | SIMD_INLINE V4b operator!=(V4f l, V4f r) 174 | { 175 | return _mm_cmpneq_ps(l.v, r.v); 176 | } 177 | 178 | SIMD_INLINE V4b operator!=(V4i l, V4i r) 179 | { 180 | return _mm_xor_si128(_mm_setzero_si128(), 
_mm_cmpeq_epi32(l.v, r.v)); 181 | } 182 | 183 | SIMD_INLINE V4b operator<(V4f l, V4f r) 184 | { 185 | return _mm_cmplt_ps(l.v, r.v); 186 | } 187 | 188 | SIMD_INLINE V4b operator<(V4i l, V4i r) 189 | { 190 | return _mm_cmplt_epi32(l.v, r.v); 191 | } 192 | 193 | SIMD_INLINE V4b operator<=(V4f l, V4f r) 194 | { 195 | return _mm_cmple_ps(l.v, r.v); 196 | } 197 | 198 | SIMD_INLINE V4b operator<=(V4i l, V4i r) 199 | { 200 | return _mm_xor_si128(_mm_setzero_si128(), _mm_cmplt_epi32(r.v, l.v)); 201 | } 202 | 203 | SIMD_INLINE V4b operator>(V4f l, V4f r) 204 | { 205 | return _mm_cmpgt_ps(l.v, r.v); 206 | } 207 | 208 | SIMD_INLINE V4b operator>(V4i l, V4i r) 209 | { 210 | return _mm_cmpgt_epi32(l.v, r.v); 211 | } 212 | 213 | SIMD_INLINE V4b operator>=(V4f l, V4f r) 214 | { 215 | return _mm_cmpge_ps(l.v, r.v); 216 | } 217 | 218 | SIMD_INLINE V4b operator>=(V4i l, V4i r) 219 | { 220 | return _mm_xor_si128(_mm_setzero_si128(), _mm_cmplt_epi32(l.v, r.v)); 221 | } 222 | 223 | SIMD_INLINE V4b operator!(V4b v) 224 | { 225 | return _mm_xor_ps(_mm_setzero_ps(), v.v); 226 | } 227 | 228 | SIMD_INLINE V4b operator&(V4b l, V4b r) 229 | { 230 | return _mm_and_ps(l.v, r.v); 231 | } 232 | 233 | SIMD_INLINE V4b operator|(V4b l, V4b r) 234 | { 235 | return _mm_or_ps(l.v, r.v); 236 | } 237 | 238 | SIMD_INLINE V4b operator^(V4b l, V4b r) 239 | { 240 | return _mm_xor_ps(l.v, r.v); 241 | } 242 | 243 | SIMD_INLINE void operator&=(V4b& l, V4b r) 244 | { 245 | l.v = _mm_and_ps(l.v, r.v); 246 | } 247 | 248 | SIMD_INLINE void operator|=(V4b& l, V4b r) 249 | { 250 | l.v = _mm_or_ps(l.v, r.v); 251 | } 252 | 253 | SIMD_INLINE void operator^=(V4b l, V4b r) 254 | { 255 | l.v = _mm_xor_ps(l.v, r.v); 256 | } 257 | 258 | SIMD_INLINE V4f abs(V4f v) 259 | { 260 | return _mm_andnot_ps(V4f::sign(), v.v); 261 | } 262 | 263 | SIMD_INLINE V4f copysign(V4f x, V4f y) 264 | { 265 | V4f sign = V4f::sign(); 266 | 267 | return _mm_or_ps(_mm_andnot_ps(sign.v, x.v), _mm_and_ps(y.v, sign.v)); 268 | } 269 | 270 | SIMD_INLINE V4f flipsign(V4f x, V4f y) 271 | { 272 | return _mm_xor_ps(x.v, _mm_and_ps(y.v, V4f::sign())); 273 | } 274 | 275 | SIMD_INLINE V4f min(V4f l, V4f r) 276 | { 277 | return _mm_min_ps(l.v, r.v); 278 | } 279 | 280 | SIMD_INLINE V4f max(V4f l, V4f r) 281 | { 282 | return _mm_max_ps(l.v, r.v); 283 | } 284 | 285 | SIMD_INLINE V4f select(V4f l, V4f r, V4b m) 286 | { 287 | return _mm_or_ps(_mm_andnot_ps(m.v, l.v), _mm_and_ps(r.v, m.v)); 288 | } 289 | 290 | SIMD_INLINE V4i select(V4i l, V4i r, V4b m) 291 | { 292 | __m128i mi = _mm_castps_si128(m.v); 293 | 294 | return _mm_or_si128(_mm_andnot_si128(mi, l.v), _mm_and_si128(r.v, mi)); 295 | } 296 | 297 | SIMD_INLINE bool none(V4b v) 298 | { 299 | return _mm_movemask_ps(v.v) == 0; 300 | } 301 | 302 | SIMD_INLINE bool any(V4b v) 303 | { 304 | return _mm_movemask_ps(v.v) != 0; 305 | } 306 | 307 | SIMD_INLINE bool all(V4b v) 308 | { 309 | return _mm_movemask_ps(v.v) == 15; 310 | } 311 | 312 | SIMD_INLINE void store(V4f v, float* ptr) 313 | { 314 | _mm_store_ps(ptr, v.v); 315 | } 316 | 317 | SIMD_INLINE void store(V4i v, int* ptr) 318 | { 319 | _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v.v); 320 | } 321 | 322 | SIMD_INLINE void loadindexed4(V4f& v0, V4f& v1, V4f& v2, V4f& v3, const void* base, const int indices[4], unsigned int stride) 323 | { 324 | const char* ptr = static_cast(base); 325 | 326 | __m128 r0 = _mm_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 327 | __m128 r1 = _mm_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 328 | __m128 r2 = _mm_load_ps(reinterpret_cast(ptr 
+ indices[2] * stride)); 329 | __m128 r3 = _mm_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 330 | 331 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 332 | 333 | v0.v = r0; 334 | v1.v = r1; 335 | v2.v = r2; 336 | v3.v = r3; 337 | } 338 | 339 | SIMD_INLINE void storeindexed4(const V4f& v0, const V4f& v1, const V4f& v2, const V4f& v3, void* base, const int indices[4], unsigned int stride) 340 | { 341 | char* ptr = static_cast(base); 342 | 343 | __m128 r0 = v0.v; 344 | __m128 r1 = v1.v; 345 | __m128 r2 = v2.v; 346 | __m128 r3 = v3.v; 347 | 348 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 349 | 350 | _mm_store_ps(reinterpret_cast(ptr + indices[0] * stride), r0); 351 | _mm_store_ps(reinterpret_cast(ptr + indices[1] * stride), r1); 352 | _mm_store_ps(reinterpret_cast(ptr + indices[2] * stride), r2); 353 | _mm_store_ps(reinterpret_cast(ptr + indices[3] * stride), r3); 354 | } 355 | 356 | SIMD_INLINE void loadindexed8(V4f& v0, V4f& v1, V4f& v2, V4f& v3, V4f& v4, V4f& v5, V4f& v6, V4f& v7, const void* base, const int indices[4], unsigned int stride) 357 | { 358 | const char* ptr = static_cast(base); 359 | 360 | const float* ptr0 = reinterpret_cast(ptr + indices[0] * stride); 361 | const float* ptr1 = reinterpret_cast(ptr + indices[1] * stride); 362 | const float* ptr2 = reinterpret_cast(ptr + indices[2] * stride); 363 | const float* ptr3 = reinterpret_cast(ptr + indices[3] * stride); 364 | 365 | __m128 r0 = _mm_load_ps(ptr0 + 0); 366 | __m128 r1 = _mm_load_ps(ptr1 + 0); 367 | __m128 r2 = _mm_load_ps(ptr2 + 0); 368 | __m128 r3 = _mm_load_ps(ptr3 + 0); 369 | __m128 r4 = _mm_load_ps(ptr0 + 4); 370 | __m128 r5 = _mm_load_ps(ptr1 + 4); 371 | __m128 r6 = _mm_load_ps(ptr2 + 4); 372 | __m128 r7 = _mm_load_ps(ptr3 + 4); 373 | 374 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 375 | _MM_TRANSPOSE4_PS(r4, r5, r6, r7); 376 | 377 | v0.v = r0; 378 | v1.v = r1; 379 | v2.v = r2; 380 | v3.v = r3; 381 | v4.v = r4; 382 | v5.v = r5; 383 | v6.v = r6; 384 | v7.v = r7; 385 | } 386 | } 387 | 388 | namespace simd 389 | { 390 | template <> struct VNf_<4> { typedef V4f type; }; 391 | template <> struct VNi_<4> { typedef V4i type; }; 392 | template <> struct VNb_<4> { typedef V4b type; }; 393 | } 394 | 395 | using simd::V4f; 396 | using simd::V4i; 397 | using simd::V4b; 398 | -------------------------------------------------------------------------------- /src/base/SIMD_Scalar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace simd 6 | { 7 | struct V1f 8 | { 9 | float v; 10 | 11 | SIMD_INLINE V1f() 12 | { 13 | } 14 | 15 | SIMD_INLINE V1f(float v): v(v) 16 | { 17 | } 18 | 19 | SIMD_INLINE operator float() const 20 | { 21 | return v; 22 | } 23 | 24 | SIMD_INLINE static V1f zero() 25 | { 26 | return 0.f; 27 | } 28 | 29 | SIMD_INLINE static V1f one(float v) 30 | { 31 | return v; 32 | } 33 | 34 | SIMD_INLINE static V1f load(const float* ptr) 35 | { 36 | return *ptr; 37 | } 38 | }; 39 | 40 | struct V1i 41 | { 42 | int v; 43 | 44 | SIMD_INLINE V1i() 45 | { 46 | } 47 | 48 | SIMD_INLINE V1i(int v): v(v) 49 | { 50 | } 51 | 52 | SIMD_INLINE operator int() const 53 | { 54 | return v; 55 | } 56 | 57 | SIMD_INLINE static V1i zero() 58 | { 59 | return 0; 60 | } 61 | 62 | SIMD_INLINE static V1i one(int v) 63 | { 64 | return v; 65 | } 66 | 67 | SIMD_INLINE static V1i load(const int* ptr) 68 | { 69 | return *ptr; 70 | } 71 | }; 72 | 73 | struct V1b 74 | { 75 | bool v; 76 | 77 | SIMD_INLINE V1b() 78 | { 79 | } 80 | 81 | SIMD_INLINE V1b(bool v): v(v) 82 | { 83 | } 84 | 85 | 
SIMD_INLINE operator bool() const 86 | { 87 | return v; 88 | } 89 | 90 | SIMD_INLINE static V1b zero() 91 | { 92 | return false; 93 | } 94 | }; 95 | 96 | SIMD_INLINE V1f bitcast(const V1i& v) 97 | { 98 | union { float f; int i; } u; 99 | u.i = v.v; 100 | return u.f; 101 | } 102 | 103 | SIMD_INLINE V1i bitcast(const V1f& v) 104 | { 105 | union { float f; int i; } u; 106 | u.f = v.v; 107 | return u.i; 108 | } 109 | 110 | SIMD_INLINE V1f operator+(V1f v) 111 | { 112 | return v; 113 | } 114 | 115 | SIMD_INLINE V1f operator-(V1f v) 116 | { 117 | return -v.v; 118 | } 119 | 120 | SIMD_INLINE V1f operator+(V1f l, V1f r) 121 | { 122 | return l.v + r.v; 123 | } 124 | 125 | SIMD_INLINE V1f operator-(V1f l, V1f r) 126 | { 127 | return l.v - r.v; 128 | } 129 | 130 | SIMD_INLINE V1f operator*(V1f l, V1f r) 131 | { 132 | return l.v * r.v; 133 | } 134 | 135 | SIMD_INLINE V1f operator/(V1f l, V1f r) 136 | { 137 | return l.v / r.v; 138 | } 139 | 140 | SIMD_INLINE void operator+=(V1f& l, V1f r) 141 | { 142 | l.v += r.v; 143 | } 144 | 145 | SIMD_INLINE void operator-=(V1f& l, V1f r) 146 | { 147 | l.v -= r.v; 148 | } 149 | 150 | SIMD_INLINE void operator*=(V1f& l, V1f r) 151 | { 152 | l.v *= r.v; 153 | } 154 | 155 | SIMD_INLINE void operator/=(V1f& l, V1f r) 156 | { 157 | l.v /= r.v; 158 | } 159 | 160 | SIMD_INLINE V1b operator==(V1f l, V1f r) 161 | { 162 | return l.v == r.v; 163 | } 164 | 165 | SIMD_INLINE V1b operator==(V1i l, V1i r) 166 | { 167 | return l.v == r.v; 168 | } 169 | 170 | SIMD_INLINE V1b operator!=(V1f l, V1f r) 171 | { 172 | return l.v != r.v; 173 | } 174 | 175 | SIMD_INLINE V1b operator!=(V1i l, V1i r) 176 | { 177 | return l.v != r.v; 178 | } 179 | 180 | SIMD_INLINE V1b operator<(V1f l, V1f r) 181 | { 182 | return l.v < r.v; 183 | } 184 | 185 | SIMD_INLINE V1b operator<(V1i l, V1i r) 186 | { 187 | return l.v < r.v; 188 | } 189 | 190 | SIMD_INLINE V1b operator<=(V1f l, V1f r) 191 | { 192 | return l.v <= r.v; 193 | } 194 | 195 | SIMD_INLINE V1b operator<=(V1i l, V1i r) 196 | { 197 | return l.v <= r.v; 198 | } 199 | 200 | SIMD_INLINE V1b operator>(V1f l, V1f r) 201 | { 202 | return l.v > r.v; 203 | } 204 | 205 | SIMD_INLINE V1b operator>(V1i l, V1i r) 206 | { 207 | return l.v > r.v; 208 | } 209 | 210 | SIMD_INLINE V1b operator>=(V1f l, V1f r) 211 | { 212 | return l.v >= r.v; 213 | } 214 | 215 | SIMD_INLINE V1b operator>=(V1i l, V1i r) 216 | { 217 | return l.v >= r.v; 218 | } 219 | 220 | SIMD_INLINE V1b operator!(V1b v) 221 | { 222 | return !v.v; 223 | } 224 | 225 | SIMD_INLINE V1b operator&(V1b l, V1b r) 226 | { 227 | return l.v & r.v; 228 | } 229 | 230 | SIMD_INLINE V1b operator|(V1b l, V1b r) 231 | { 232 | return l.v | r.v; 233 | } 234 | 235 | SIMD_INLINE V1b operator^(V1b l, V1b r) 236 | { 237 | return l.v ^ r.v; 238 | } 239 | 240 | SIMD_INLINE void operator&=(V1b& l, V1b r) 241 | { 242 | l.v &= r.v; 243 | } 244 | 245 | SIMD_INLINE void operator|=(V1b& l, V1b r) 246 | { 247 | l.v |= r.v; 248 | } 249 | 250 | SIMD_INLINE void operator^=(V1b l, V1b r) 251 | { 252 | l.v ^= r.v; 253 | } 254 | 255 | SIMD_INLINE V1f abs(V1f v) 256 | { 257 | return fabsf(v.v); 258 | } 259 | 260 | SIMD_INLINE V1f copysign(V1f x, V1f y) 261 | { 262 | return copysignf(x, y); 263 | } 264 | 265 | SIMD_INLINE V1f flipsign(V1f x, V1f y) 266 | { 267 | return y.v < 0.f ? -x.v : x.v; 268 | } 269 | 270 | SIMD_INLINE V1f min(V1f l, V1f r) 271 | { 272 | return l.v < r.v ? l.v : r.v; 273 | } 274 | 275 | SIMD_INLINE V1f max(V1f l, V1f r) 276 | { 277 | return l.v > r.v ? 
l.v : r.v; 278 | } 279 | 280 | SIMD_INLINE V1f select(V1f l, V1f r, V1b m) 281 | { 282 | return m.v ? r.v : l.v; 283 | } 284 | 285 | SIMD_INLINE V1i select(V1i l, V1i r, V1b m) 286 | { 287 | return m.v ? r.v : l.v; 288 | } 289 | 290 | SIMD_INLINE bool none(V1b v) 291 | { 292 | return !v.v; 293 | } 294 | 295 | SIMD_INLINE bool any(V1b v) 296 | { 297 | return v.v; 298 | } 299 | 300 | SIMD_INLINE bool all(V1b v) 301 | { 302 | return v.v; 303 | } 304 | 305 | SIMD_INLINE void store(V1f v, float* ptr) 306 | { 307 | *ptr = v.v; 308 | } 309 | 310 | SIMD_INLINE void store(V1i v, int* ptr) 311 | { 312 | *ptr = v.v; 313 | } 314 | 315 | SIMD_INLINE void loadindexed4(V1f& v0, V1f& v1, V1f& v2, V1f& v3, const void* base, const int indices[1], unsigned int stride) 316 | { 317 | const float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 318 | 319 | v0.v = ptr[0]; 320 | v1.v = ptr[1]; 321 | v2.v = ptr[2]; 322 | v3.v = ptr[3]; 323 | } 324 | 325 | SIMD_INLINE void storeindexed4(const V1f& v0, const V1f& v1, const V1f& v2, const V1f& v3, void* base, const int indices[1], unsigned int stride) 326 | { 327 | float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 328 | 329 | ptr[0] = v0.v; 330 | ptr[1] = v1.v; 331 | ptr[2] = v2.v; 332 | ptr[3] = v3.v; 333 | } 334 | 335 | SIMD_INLINE void loadindexed8(V1f& v0, V1f& v1, V1f& v2, V1f& v3, V1f& v4, V1f& v5, V1f& v6, V1f& v7, const void* base, const int indices[1], unsigned int stride) 336 | { 337 | const float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 338 | 339 | v0.v = ptr[0]; 340 | v1.v = ptr[1]; 341 | v2.v = ptr[2]; 342 | v3.v = ptr[3]; 343 | v4.v = ptr[4]; 344 | v5.v = ptr[5]; 345 | v6.v = ptr[6]; 346 | v7.v = ptr[7]; 347 | } 348 | } 349 | 350 | namespace simd 351 | { 352 | template <> struct VNf_<1> { typedef V1f type; }; 353 | template <> struct VNi_<1> { typedef V1i type; }; 354 | template <> struct VNb_<1> { typedef V1b type; }; 355 | } 356 | 357 | using simd::V1f; 358 | using simd::V1i; 359 | using simd::V1b; 360 | -------------------------------------------------------------------------------- /src/base/WorkQueue.cpp: -------------------------------------------------------------------------------- 1 | #include "WorkQueue.h" 2 | 3 | #include "microprofile.h" 4 | 5 | unsigned int WorkQueue::getIdealWorkerCount() 6 | { 7 | return std::max(std::thread::hardware_concurrency(), 1u); 8 | } 9 | 10 | WorkQueue::WorkQueue(unsigned int workerCount) 11 | { 12 | for (unsigned int i = 0; i < workerCount; ++i) 13 | workers.emplace_back(workerThreadFun, this, i); 14 | } 15 | 16 | WorkQueue::~WorkQueue() 17 | { 18 | pushItem(std::shared_ptr(), workers.size()); 19 | 20 | for (unsigned int i = 0; i < workers.size(); ++i) 21 | workers[i].join(); 22 | } 23 | 24 | void WorkQueue::pushItem(std::shared_ptr item, int count) 25 | { 26 | MICROPROFILE_SCOPEI("WorkQueue", "Push", -1); 27 | 28 | std::unique_lock lock(itemsMutex); 29 | 30 | items.push(std::make_pair(std::move(item), count)); 31 | 32 | lock.unlock(); 33 | 34 | MICROPROFILE_SCOPEI("WorkQueue", "Notify", -1); 35 | 36 | if (count > 1) 37 | itemsCondition.notify_all(); 38 | else 39 | itemsCondition.notify_one(); 40 | } 41 | 42 | void WorkQueue::workerThreadFun(WorkQueue* queue, int worker) 43 | { 44 | char name[16]; 45 | sprintf(name, "Worker %d", worker); 46 | MicroProfileOnThreadCreate(name); 47 | 48 | for (;;) 49 | { 50 | std::shared_ptr item; 51 | 52 | { 53 | std::unique_lock lock(queue->itemsMutex); 54 | 55 | queue->itemsCondition.wait(lock, [&]() { return 
!queue->items.empty(); }); 56 | 57 | auto& slot = queue->items.front(); 58 | 59 | if (slot.second <= 1) 60 | { 61 | item = std::move(slot.first); 62 | queue->items.pop(); 63 | } 64 | else 65 | { 66 | item = slot.first; 67 | slot.second--; 68 | } 69 | } 70 | 71 | if (!item) break; 72 | 73 | MICROPROFILE_SCOPEI("WorkQueue", "JobRun", 0x606060); 74 | 75 | item->run(worker); 76 | } 77 | 78 | MicroProfileOnThreadExit(); 79 | } -------------------------------------------------------------------------------- /src/base/WorkQueue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | class WorkQueue 11 | { 12 | public: 13 | struct Item 14 | { 15 | virtual ~Item() {} 16 | 17 | virtual void run(int worker) = 0; 18 | }; 19 | 20 | static unsigned int getIdealWorkerCount(); 21 | 22 | WorkQueue(unsigned int workerCount); 23 | ~WorkQueue(); 24 | 25 | void pushItem(std::shared_ptr item, int count = 1); 26 | 27 | template 28 | void pushFunction(F fun, int count = 1) 29 | { 30 | pushItem(std::make_shared(std::move(fun))); 31 | } 32 | 33 | unsigned int getWorkerCount() const 34 | { 35 | return workers.size(); 36 | } 37 | 38 | private: 39 | std::vector workers; 40 | 41 | std::mutex itemsMutex; 42 | std::condition_variable itemsCondition; 43 | std::queue, int>> items; 44 | 45 | static void workerThreadFun(WorkQueue* queue, int worker); 46 | 47 | template 48 | struct ItemFunction: WorkQueue::Item 49 | { 50 | T function; 51 | 52 | ItemFunction(T function): function(std::move(function)) 53 | { 54 | } 55 | 56 | void run(int worker) override 57 | { 58 | function(); 59 | } 60 | }; 61 | 62 | }; -------------------------------------------------------------------------------- /src/base/microprofile.cpp: -------------------------------------------------------------------------------- 1 | #define MICROPROFILE_HELP_ALT "Right-Click" 2 | #define MICROPROFILE_HELP_MOD "Ctrl" 3 | 4 | #ifdef _WIN32 5 | #include "../glad/glad.h" 6 | #endif 7 | 8 | #define GL_GLEXT_PROTOTYPES 9 | #include 10 | 11 | #ifdef __APPLE__ 12 | #define GL_DO_NOT_WARN_IF_MULTI_GL_VERSION_HEADERS_INCLUDED 13 | #include 14 | #endif 15 | 16 | #define MICROPROFILE_WEBSERVER 1 17 | #define MICROPROFILE_GPU_TIMERS_GL 1 18 | 19 | #define MICROPROFILE_IMPL 20 | #include "microprofile.h" 21 | 22 | #define MICROPROFILEUI_IMPL 23 | #include "microprofileui.h" 24 | 25 | #define MICROPROFILEDRAW_IMPL 26 | #include "microprofiledraw.h" 27 | 28 | #ifdef __linux__ 29 | #define GL_PROC(ret, name, args, argcall) \ 30 | ret name args { \ 31 | static ret (*ptr) args = reinterpret_cast(glfwGetProcAddress(#name)); \ 32 | return ptr argcall; \ 33 | } 34 | 35 | GL_PROC(void, glQueryCounter, (GLuint id, GLenum target), (id, target)) 36 | GL_PROC(void, glGetQueryObjectui64v, (GLuint id, GLenum pname, GLuint64 *params), (id, pname, params)) 37 | #endif 38 | -------------------------------------------------------------------------------- /src/glad/khrplatform.h: -------------------------------------------------------------------------------- 1 | #ifndef __khrplatform_h_ 2 | #define __khrplatform_h_ 3 | 4 | /* 5 | ** Copyright (c) 2008-2018 The Khronos Group Inc. 
6 | ** 7 | ** Permission is hereby granted, free of charge, to any person obtaining a 8 | ** copy of this software and/or associated documentation files (the 9 | ** "Materials"), to deal in the Materials without restriction, including 10 | ** without limitation the rights to use, copy, modify, merge, publish, 11 | ** distribute, sublicense, and/or sell copies of the Materials, and to 12 | ** permit persons to whom the Materials are furnished to do so, subject to 13 | ** the following conditions: 14 | ** 15 | ** The above copyright notice and this permission notice shall be included 16 | ** in all copies or substantial portions of the Materials. 17 | ** 18 | ** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | ** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 25 | */ 26 | 27 | /* Khronos platform-specific types and definitions. 28 | * 29 | * The master copy of khrplatform.h is maintained in the Khronos EGL 30 | * Registry repository at https://github.com/KhronosGroup/EGL-Registry 31 | * The last semantic modification to khrplatform.h was at commit ID: 32 | * 67a3e0864c2d75ea5287b9f3d2eb74a745936692 33 | * 34 | * Adopters may modify this file to suit their platform. Adopters are 35 | * encouraged to submit platform specific modifications to the Khronos 36 | * group so that they can be included in future versions of this file. 37 | * Please submit changes by filing pull requests or issues on 38 | * the EGL Registry repository linked above. 39 | * 40 | * 41 | * See the Implementer's Guidelines for information about where this file 42 | * should be located on your system and for more details of its use: 43 | * http://www.khronos.org/registry/implementers_guide.pdf 44 | * 45 | * This file should be included as 46 | * #include 47 | * by Khronos client API header files that use its types and defines. 48 | * 49 | * The types in khrplatform.h should only be used to define API-specific types. 50 | * 51 | * Types defined in khrplatform.h: 52 | * khronos_int8_t signed 8 bit 53 | * khronos_uint8_t unsigned 8 bit 54 | * khronos_int16_t signed 16 bit 55 | * khronos_uint16_t unsigned 16 bit 56 | * khronos_int32_t signed 32 bit 57 | * khronos_uint32_t unsigned 32 bit 58 | * khronos_int64_t signed 64 bit 59 | * khronos_uint64_t unsigned 64 bit 60 | * khronos_intptr_t signed same number of bits as a pointer 61 | * khronos_uintptr_t unsigned same number of bits as a pointer 62 | * khronos_ssize_t signed size 63 | * khronos_usize_t unsigned size 64 | * khronos_float_t signed 32 bit floating point 65 | * khronos_time_ns_t unsigned 64 bit time in nanoseconds 66 | * khronos_utime_nanoseconds_t unsigned time interval or absolute time in 67 | * nanoseconds 68 | * khronos_stime_nanoseconds_t signed time interval in nanoseconds 69 | * khronos_boolean_enum_t enumerated boolean type. This should 70 | * only be used as a base type when a client API's boolean type is 71 | * an enum. Client APIs which use an integer or other type for 72 | * booleans cannot use this as the base type for their boolean. 
73 | * 74 | * Tokens defined in khrplatform.h: 75 | * 76 | * KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values. 77 | * 78 | * KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0. 79 | * KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0. 80 | * 81 | * Calling convention macros defined in this file: 82 | * KHRONOS_APICALL 83 | * KHRONOS_APIENTRY 84 | * KHRONOS_APIATTRIBUTES 85 | * 86 | * These may be used in function prototypes as: 87 | * 88 | * KHRONOS_APICALL void KHRONOS_APIENTRY funcname( 89 | * int arg1, 90 | * int arg2) KHRONOS_APIATTRIBUTES; 91 | */ 92 | 93 | /*------------------------------------------------------------------------- 94 | * Definition of KHRONOS_APICALL 95 | *------------------------------------------------------------------------- 96 | * This precedes the return type of the function in the function prototype. 97 | */ 98 | #if defined(_WIN32) && !defined(__SCITECH_SNAP__) 99 | # define KHRONOS_APICALL __declspec(dllimport) 100 | #elif defined (__SYMBIAN32__) 101 | # define KHRONOS_APICALL IMPORT_C 102 | #elif defined(__ANDROID__) 103 | # define KHRONOS_APICALL __attribute__((visibility("default"))) 104 | #else 105 | # define KHRONOS_APICALL 106 | #endif 107 | 108 | /*------------------------------------------------------------------------- 109 | * Definition of KHRONOS_APIENTRY 110 | *------------------------------------------------------------------------- 111 | * This follows the return type of the function and precedes the function 112 | * name in the function prototype. 113 | */ 114 | #if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__) 115 | /* Win32 but not WinCE */ 116 | # define KHRONOS_APIENTRY __stdcall 117 | #else 118 | # define KHRONOS_APIENTRY 119 | #endif 120 | 121 | /*------------------------------------------------------------------------- 122 | * Definition of KHRONOS_APIATTRIBUTES 123 | *------------------------------------------------------------------------- 124 | * This follows the closing parenthesis of the function prototype arguments. 
125 | */ 126 | #if defined (__ARMCC_2__) 127 | #define KHRONOS_APIATTRIBUTES __softfp 128 | #else 129 | #define KHRONOS_APIATTRIBUTES 130 | #endif 131 | 132 | /*------------------------------------------------------------------------- 133 | * basic type definitions 134 | *-----------------------------------------------------------------------*/ 135 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__) 136 | 137 | 138 | /* 139 | * Using 140 | */ 141 | #include 142 | typedef int32_t khronos_int32_t; 143 | typedef uint32_t khronos_uint32_t; 144 | typedef int64_t khronos_int64_t; 145 | typedef uint64_t khronos_uint64_t; 146 | #define KHRONOS_SUPPORT_INT64 1 147 | #define KHRONOS_SUPPORT_FLOAT 1 148 | 149 | #elif defined(__VMS ) || defined(__sgi) 150 | 151 | /* 152 | * Using 153 | */ 154 | #include 155 | typedef int32_t khronos_int32_t; 156 | typedef uint32_t khronos_uint32_t; 157 | typedef int64_t khronos_int64_t; 158 | typedef uint64_t khronos_uint64_t; 159 | #define KHRONOS_SUPPORT_INT64 1 160 | #define KHRONOS_SUPPORT_FLOAT 1 161 | 162 | #elif defined(_WIN32) && !defined(__SCITECH_SNAP__) 163 | 164 | /* 165 | * Win32 166 | */ 167 | typedef __int32 khronos_int32_t; 168 | typedef unsigned __int32 khronos_uint32_t; 169 | typedef __int64 khronos_int64_t; 170 | typedef unsigned __int64 khronos_uint64_t; 171 | #define KHRONOS_SUPPORT_INT64 1 172 | #define KHRONOS_SUPPORT_FLOAT 1 173 | 174 | #elif defined(__sun__) || defined(__digital__) 175 | 176 | /* 177 | * Sun or Digital 178 | */ 179 | typedef int khronos_int32_t; 180 | typedef unsigned int khronos_uint32_t; 181 | #if defined(__arch64__) || defined(_LP64) 182 | typedef long int khronos_int64_t; 183 | typedef unsigned long int khronos_uint64_t; 184 | #else 185 | typedef long long int khronos_int64_t; 186 | typedef unsigned long long int khronos_uint64_t; 187 | #endif /* __arch64__ */ 188 | #define KHRONOS_SUPPORT_INT64 1 189 | #define KHRONOS_SUPPORT_FLOAT 1 190 | 191 | #elif 0 192 | 193 | /* 194 | * Hypothetical platform with no float or int64 support 195 | */ 196 | typedef int khronos_int32_t; 197 | typedef unsigned int khronos_uint32_t; 198 | #define KHRONOS_SUPPORT_INT64 0 199 | #define KHRONOS_SUPPORT_FLOAT 0 200 | 201 | #else 202 | 203 | /* 204 | * Generic fallback 205 | */ 206 | #include 207 | typedef int32_t khronos_int32_t; 208 | typedef uint32_t khronos_uint32_t; 209 | typedef int64_t khronos_int64_t; 210 | typedef uint64_t khronos_uint64_t; 211 | #define KHRONOS_SUPPORT_INT64 1 212 | #define KHRONOS_SUPPORT_FLOAT 1 213 | 214 | #endif 215 | 216 | 217 | /* 218 | * Types that are (so far) the same on all platforms 219 | */ 220 | typedef signed char khronos_int8_t; 221 | typedef unsigned char khronos_uint8_t; 222 | typedef signed short int khronos_int16_t; 223 | typedef unsigned short int khronos_uint16_t; 224 | 225 | /* 226 | * Types that differ between LLP64 and LP64 architectures - in LLP64, 227 | * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears 228 | * to be the only LLP64 architecture in current use. 
229 | */ 230 | #ifdef _WIN64 231 | typedef signed long long int khronos_intptr_t; 232 | typedef unsigned long long int khronos_uintptr_t; 233 | typedef signed long long int khronos_ssize_t; 234 | typedef unsigned long long int khronos_usize_t; 235 | #else 236 | typedef signed long int khronos_intptr_t; 237 | typedef unsigned long int khronos_uintptr_t; 238 | typedef signed long int khronos_ssize_t; 239 | typedef unsigned long int khronos_usize_t; 240 | #endif 241 | 242 | #if KHRONOS_SUPPORT_FLOAT 243 | /* 244 | * Float type 245 | */ 246 | typedef float khronos_float_t; 247 | #endif 248 | 249 | #if KHRONOS_SUPPORT_INT64 250 | /* Time types 251 | * 252 | * These types can be used to represent a time interval in nanoseconds or 253 | * an absolute Unadjusted System Time. Unadjusted System Time is the number 254 | * of nanoseconds since some arbitrary system event (e.g. since the last 255 | * time the system booted). The Unadjusted System Time is an unsigned 256 | * 64 bit value that wraps back to 0 every 584 years. Time intervals 257 | * may be either signed or unsigned. 258 | */ 259 | typedef khronos_uint64_t khronos_utime_nanoseconds_t; 260 | typedef khronos_int64_t khronos_stime_nanoseconds_t; 261 | #endif 262 | 263 | /* 264 | * Dummy value used to pad enum types to 32 bits. 265 | */ 266 | #ifndef KHRONOS_MAX_ENUM 267 | #define KHRONOS_MAX_ENUM 0x7FFFFFFF 268 | #endif 269 | 270 | /* 271 | * Enumerated boolean type 272 | * 273 | * Values other than zero should be considered to be true. Therefore 274 | * comparisons should not be made against KHRONOS_TRUE. 275 | */ 276 | typedef enum { 277 | KHRONOS_FALSE = 0, 278 | KHRONOS_TRUE = 1, 279 | KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM 280 | } khronos_boolean_enum_t; 281 | 282 | #endif /* __khrplatform_h_ */ 283 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include "glad/glad.h" 3 | #endif 4 | 5 | #include 6 | 7 | #include "World.h" 8 | #include "Configuration.h" 9 | 10 | #include "base/WorkQueue.h" 11 | 12 | #include "microprofile/microprofile.h" 13 | #include "microprofile/microprofileui.h" 14 | #include "microprofile/microprofiledraw.h" 15 | 16 | struct Vertex 17 | { 18 | Vector2f position; 19 | unsigned char r, g, b, a; 20 | 21 | }; 22 | 23 | void RenderBox(std::vector& vertices, Coords2f coords, Vector2f size, int r, int g, int b, int a) 24 | { 25 | Vector2f axisX = coords.xVector * size.x; 26 | Vector2f axisY = coords.yVector * size.y; 27 | 28 | Vertex v; 29 | 30 | v.r = r; 31 | v.g = g; 32 | v.b = b; 33 | v.a = a; 34 | 35 | v.position = coords.pos - axisX - axisY; 36 | vertices.push_back(v); 37 | 38 | v.position = coords.pos + axisX - axisY; 39 | vertices.push_back(v); 40 | 41 | v.position = coords.pos + axisX + axisY; 42 | vertices.push_back(v); 43 | 44 | v.position = coords.pos - axisX + axisY; 45 | vertices.push_back(v); 46 | } 47 | 48 | float random(float min, float max) 49 | { 50 | return min + (max - min) * (float(rand()) / float(RAND_MAX)); 51 | } 52 | 53 | const struct 54 | { 55 | Configuration::SolveMode mode; 56 | const char* name; 57 | } kSolveModes[] = 58 | { 59 | {Configuration::Solve_Scalar, "Scalar"}, 60 | 61 | #ifdef __SSE2__ 62 | {Configuration::Solve_SSE2, "SSE2"}, 63 | #endif 64 | 65 | #ifdef __AVX2__ 66 | {Configuration::Solve_AVX2, "AVX2"}, 67 | #endif 68 | }; 69 | 70 | const struct 71 | { 72 | Configuration::IslandMode mode; 73 | const char* name; 74 | } 
kIslandModes[] = 75 | { 76 | {Configuration::Island_Single, "Single"}, 77 | {Configuration::Island_Multiple, "Multiple"}, 78 | {Configuration::Island_SingleSloppy, "Single Sloppy"}, 79 | {Configuration::Island_MultipleSloppy, "Multiple Sloppy"}, 80 | }; 81 | 82 | const char* resetWorld(World& world, int scene) 83 | { 84 | MICROPROFILE_SCOPEI("Init", "resetWorld", -1); 85 | 86 | world.bodies.clear(); 87 | world.collider.manifolds.clear(); 88 | world.collider.manifoldMap.clear(); 89 | world.solver.contactJoints.clear(); 90 | 91 | RigidBody* groundBody = world.AddBody(Coords2f(Vector2f(0, 0), 0.0f), Vector2f(10000.f, 10.0f)); 92 | groundBody->invInertia = 0.0f; 93 | groundBody->invMass = 0.0f; 94 | 95 | world.AddBody(Coords2f(Vector2f(-1000, 1500), 0.0f), Vector2f(30.0f, 30.0f)); 96 | 97 | switch (scene % 8) 98 | { 99 | case 0: 100 | { 101 | for (int bodyIndex = 0; bodyIndex < 20000; bodyIndex++) 102 | { 103 | Vector2f pos = Vector2f(random(-500.0f, 500.0f), random(50.f, 1000.0f)); 104 | Vector2f size(4.f, 4.f); 105 | 106 | world.AddBody(Coords2f(pos, 0.f), size); 107 | } 108 | 109 | return "Falling"; 110 | } 111 | 112 | case 1: 113 | { 114 | for (int left = -100; left <= 100; left++) 115 | { 116 | for (int bodyIndex = 0; bodyIndex < 100; bodyIndex++) 117 | { 118 | Vector2f pos = Vector2f(left * 20, 10 + bodyIndex * 10); 119 | Vector2f size(10, 5); 120 | 121 | world.AddBody(Coords2f(pos, 0.f), size); 122 | } 123 | } 124 | 125 | return "Wall"; 126 | } 127 | 128 | case 2: 129 | { 130 | for (int step = 0; step < 100; ++step) 131 | { 132 | Vector2f pos = Vector2f(0, 1005 - step * 10); 133 | Vector2f size(10 + step * 5, 5); 134 | 135 | world.AddBody(Coords2f(pos, 0.f), size); 136 | } 137 | 138 | return "Pyramid"; 139 | } 140 | 141 | case 3: 142 | { 143 | for (int step = 0; step < 100; ++step) 144 | { 145 | Vector2f pos = Vector2f(0, 15 + step * 10); 146 | Vector2f size(10 + step * 5, 5); 147 | 148 | world.AddBody(Coords2f(pos, 0.f), size); 149 | } 150 | 151 | return "Reverse Pyramid"; 152 | } 153 | 154 | case 4: 155 | { 156 | for (int left = -100; left <= 100; left++) 157 | { 158 | for (int bodyIndex = 0; bodyIndex < 150; bodyIndex++) 159 | { 160 | Vector2f pos = Vector2f(left * 15, 15 + bodyIndex * 10); 161 | Vector2f size(5 - bodyIndex * 0.03f, 5); 162 | 163 | world.AddBody(Coords2f(pos, 0.f), size); 164 | } 165 | } 166 | 167 | return "Stacks"; 168 | } 169 | 170 | case 5: 171 | { 172 | world.AddBody(Coords2f(Vector2f(0.f, 400.f), 0.f), Vector2f(600.f, 10.f))->invMass = 0.f; 173 | world.AddBody(Coords2f(Vector2f(800.f, 200.f), 0.f), Vector2f(400.f, 10.f))->invMass = 0.f; 174 | 175 | for (int bodyIndex = 0; bodyIndex < 20000; bodyIndex++) 176 | { 177 | Vector2f pos = Vector2f(random(0.0f, 500.0f), random(500.f, 2500.0f)); 178 | Vector2f size(4.f, 4.f); 179 | 180 | world.AddBody(Coords2f(pos, 0.f), size); 181 | } 182 | 183 | return "Stacks"; 184 | } 185 | 186 | case 6: 187 | { 188 | world.AddBody(Coords2f(Vector2f(0.f, 400.f), 0.f), Vector2f(600.f, 10.f))->invMass = 0.f; 189 | world.AddBody(Coords2f(Vector2f(800.f, 200.f), 0.f), Vector2f(400.f, 10.f))->invMass = 0.f; 190 | 191 | RigidBody* body = world.AddBody(Coords2f(Vector2f(500.f, 500.f), -0.5f), Vector2f(600.f, 10.f)); 192 | body->invMass = 0.f; 193 | body->invInertia = 0.f; 194 | 195 | for (int bodyIndex = 0; bodyIndex < 10000; bodyIndex++) 196 | { 197 | Vector2f pos1 = Vector2f(random(200.0f, 500.0f), random(500.f, 2500.0f)); 198 | Vector2f pos2 = Vector2f(random(-500.0f, -200.0f), random(500.f, 2500.0f)); 199 | Vector2f size(4.f, 4.f); 200 
| 201 | world.AddBody(Coords2f(pos1, 0.f), size); 202 | world.AddBody(Coords2f(pos2, 0.f), size); 203 | } 204 | 205 | return "Dual Stacks"; 206 | } 207 | 208 | case 7: 209 | { 210 | for (int group = -5; group <= 5; ++group) 211 | { 212 | RigidBody* splitter = world.AddBody(Coords2f(Vector2f(group * 300, 500.f), 0.f), Vector2f(20.f, 1000.f)); 213 | splitter->invMass = 0.f; 214 | splitter->invInertia = 0.f; 215 | 216 | for (int bodyIndex = 0; bodyIndex < 4500; bodyIndex++) 217 | { 218 | Vector2f pos = Vector2f(group * 300 + random(50.f, 250.0f), random(50.f, 1500.0f)); 219 | Vector2f size(4.f, 4.f); 220 | 221 | world.AddBody(Coords2f(pos, 0.f), size); 222 | } 223 | } 224 | 225 | return "Islands"; 226 | } 227 | } 228 | 229 | return "Empty"; 230 | } 231 | 232 | bool keyPressed[GLFW_KEY_LAST + 1]; 233 | int mouseScrollDelta = 0; 234 | 235 | static void errorCallback(int error, const char* description) 236 | { 237 | fputs(description, stderr); 238 | } 239 | 240 | static void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) 241 | { 242 | keyPressed[key] = (action == GLFW_PRESS); 243 | } 244 | 245 | static void scrollCallback(GLFWwindow* window, double x, double y) 246 | { 247 | mouseScrollDelta = y; 248 | } 249 | 250 | int main(int argc, char** argv) 251 | { 252 | MicroProfileOnThreadCreate("Main"); 253 | MicroProfileSetEnableAllGroups(true); 254 | MicroProfileSetForceMetaCounters(true); 255 | 256 | int windowWidth = 1280, windowHeight = 1024; 257 | 258 | std::unique_ptr<WorkQueue> queue(new WorkQueue(WorkQueue::getIdealWorkerCount() - 1)); 259 | 260 | World world; 261 | 262 | int currentSolveMode = sizeof(kSolveModes) / sizeof(kSolveModes[0]) - 1; 263 | int currentIslandMode = sizeof(kIslandModes) / sizeof(kIslandModes[0]) - 1; 264 | int currentScene = 0; 265 | 266 | const char* currentSceneName = resetWorld(world, currentScene); 267 | 268 | const float gravity = -200.0f; 269 | const float integrationTime = 1 / 60.f; 270 | 271 | world.gravity = gravity; 272 | 273 | glfwSetErrorCallback(errorCallback); 274 | 275 | if (!glfwInit()) return 1; 276 | 277 | GLFWwindow* window = glfwCreateWindow(windowWidth, windowHeight, "PhyX", NULL, NULL); 278 | if (!window) return 1; 279 | 280 | glfwMakeContextCurrent(window); 281 | glfwSwapInterval(1); 282 | glfwSetKeyCallback(window, keyCallback); 283 | glfwSetScrollCallback(window, scrollCallback); 284 | 285 | #ifdef _WIN32 286 | if (!gladLoadGL()) return 2; 287 | #endif 288 | 289 | MicroProfileDrawInitGL(); 290 | MicroProfileGpuInitGL(); 291 | 292 | bool paused = false; 293 | 294 | double prevUpdateTime = 0.0f; 295 | 296 | std::vector<Vertex> vertices; 297 | 298 | float viewOffsetX = -500; 299 | float viewOffsetY = -40; 300 | float viewScale = 0.5f; 301 | 302 | int frameIndex = 0; 303 | 304 | while (!glfwWindowShouldClose(window)) 305 | { 306 | MicroProfileFlip(); 307 | 308 | MICROPROFILE_SCOPEI("MAIN", "Frame", 0xffee00); 309 | 310 | MICROPROFILE_LABELF("MAIN", "Index %d", frameIndex++); 311 | 312 | int width, height; 313 | glfwGetWindowSize(window, &width, &height); 314 | 315 | int frameWidth, frameHeight; 316 | glfwGetFramebufferSize(window, &frameWidth, &frameHeight); 317 | 318 | double mouseX, mouseY; 319 | glfwGetCursorPos(window, &mouseX, &mouseY); 320 | 321 | glViewport(0, 0, frameWidth, frameHeight); 322 | glClearColor(0.2f, 0.2f, 0.2f, 1.f); 323 | glClear(GL_COLOR_BUFFER_BIT); 324 | 325 | glMatrixMode(GL_PROJECTION); 326 | glLoadIdentity(); 327 | glOrtho(viewOffsetX / viewScale, width / viewScale + viewOffsetX / viewScale, viewOffsetY / 
viewScale, height / viewScale + viewOffsetY / viewScale, 1.f, -1.f); 328 | 329 | vertices.clear(); 330 | 331 | if (glfwGetTime() > prevUpdateTime + integrationTime) 332 | { 333 | prevUpdateTime += integrationTime; 334 | 335 | if (!paused) 336 | { 337 | RigidBody* draggedBody = &world.bodies[1]; 338 | Vector2f dragTarget = 339 | glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_RIGHT) 340 | ? Vector2f(mouseX + viewOffsetX, height + viewOffsetY - mouseY) / viewScale 341 | : draggedBody->coords.pos; 342 | 343 | Vector2f dstVelocity = (dragTarget - draggedBody->coords.pos) * 5e1f; 344 | 345 | draggedBody->acceleration.y -= gravity; 346 | draggedBody->acceleration += (dstVelocity - draggedBody->velocity) * 5e0; 347 | 348 | Configuration config = { kSolveModes[currentSolveMode].mode, kIslandModes[currentIslandMode].mode, 15, 15 }; 349 | world.Update(*queue, integrationTime, config); 350 | } 351 | } 352 | 353 | char stats[256]; 354 | sprintf(stats, "Scene: %s | Bodies: %d Manifolds: %d Contacts: %d Islands: %d (biggest: %d) | Cores: %d; Solve: %s; Island: %s; Iterations: %.2f", 355 | currentSceneName, 356 | int(world.bodies.size), 357 | int(world.collider.manifolds.size), 358 | int(world.solver.contactJoints.size), 359 | int(world.solver.islandCount), 360 | int(world.solver.islandMaxSize), 361 | int(queue->getWorkerCount() + 1), 362 | kSolveModes[currentSolveMode].name, 363 | kIslandModes[currentIslandMode].name, 364 | 0.f); 365 | 366 | { 367 | MICROPROFILE_SCOPEI("Render", "Render", 0xff0000); 368 | 369 | { 370 | MICROPROFILE_SCOPEI("Render", "Prepare", -1); 371 | 372 | for (int bodyIndex = 0; bodyIndex < world.bodies.size; bodyIndex++) 373 | { 374 | RigidBody* body = &world.bodies[bodyIndex]; 375 | Coords2f bodyCoords = body->coords; 376 | Vector2f size = body->geom.size; 377 | 378 | float colorMult = float(bodyIndex) / float(world.bodies.size) * 0.5f + 0.5f; 379 | int r = 50 * colorMult; 380 | int g = 125 * colorMult; 381 | int b = 218 * colorMult; 382 | 383 | if (bodyIndex == 1) //dragged body 384 | { 385 | r = 242; 386 | g = 236; 387 | b = 164; 388 | } 389 | 390 | RenderBox(vertices, bodyCoords, size, r, g, b, 255); 391 | } 392 | 393 | if (glfwGetKey(window, GLFW_KEY_V)) 394 | { 395 | for (int manifoldIndex = 0; manifoldIndex < world.collider.manifolds.size; manifoldIndex++) 396 | { 397 | Manifold& man = world.collider.manifolds[manifoldIndex]; 398 | 399 | for (int collisionNumber = 0; collisionNumber < man.pointCount; collisionNumber++) 400 | { 401 | ContactPoint& cp = world.collider.contactPoints[man.pointIndex + collisionNumber]; 402 | 403 | Coords2f coords = Coords2f(Vector2f(0.0f, 0.0f), 3.1415f / 4.0f); 404 | 405 | coords.pos = world.bodies[man.body1Index].coords.pos + cp.delta1; 406 | 407 | float redMult = cp.isNewlyCreated ? 
0.5f : 1.0f; 408 | 409 | RenderBox(vertices, coords, Vector2f(3.0f, 3.0f), 100, 100 * redMult, 100 * redMult, 100); 410 | 411 | coords.pos = world.bodies[man.body2Index].coords.pos + cp.delta2; 412 | 413 | RenderBox(vertices, coords, Vector2f(3.0f, 3.0f), 150, 150 * redMult, 150 * redMult, 100); 414 | } 415 | } 416 | } 417 | } 418 | 419 | { 420 | MICROPROFILE_SCOPEI("Render", "Perform", -1); 421 | MICROPROFILE_SCOPEGPUI("Scene", -1); 422 | 423 | if (!vertices.empty()) 424 | { 425 | glEnableClientState(GL_VERTEX_ARRAY); 426 | glEnableClientState(GL_COLOR_ARRAY); 427 | 428 | glVertexPointer(2, GL_FLOAT, sizeof(Vertex), &vertices[0].position); 429 | glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(Vertex), &vertices[0].r); 430 | 431 | glDrawArrays(GL_QUADS, 0, vertices.size()); 432 | 433 | glDisableClientState(GL_VERTEX_ARRAY); 434 | glDisableClientState(GL_COLOR_ARRAY); 435 | } 436 | } 437 | 438 | { 439 | MICROPROFILE_SCOPEI("Render", "Profile", -1); 440 | MICROPROFILE_SCOPEGPUI("Profile", -1); 441 | 442 | MicroProfileBeginDraw(width, height, 1.f); 443 | 444 | MicroProfileDraw(width, height); 445 | MicroProfileDrawText(2, height - 12, 0xffffffff, stats, strlen(stats)); 446 | 447 | MicroProfileEndDraw(); 448 | } 449 | } 450 | 451 | MICROPROFILE_COUNTER_ADD("frame/count", 1); 452 | 453 | { 454 | MICROPROFILE_SCOPEI("MAIN", "Flip", 0xffee00); 455 | 456 | glfwSwapBuffers(window); 457 | } 458 | 459 | { 460 | MICROPROFILE_SCOPEI("MAIN", "Input", 0xffee00); 461 | 462 | // Handle input 463 | memset(keyPressed, 0, sizeof(keyPressed)); 464 | mouseScrollDelta = 0; 465 | 466 | glfwPollEvents(); 467 | 468 | bool mouseDown0 = glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_LEFT) == GLFW_PRESS; 469 | bool mouseDown1 = glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_RIGHT) == GLFW_PRESS; 470 | 471 | MicroProfileMouseButton(mouseDown0, mouseDown1); 472 | MicroProfileMousePosition(mouseX, mouseY, mouseScrollDelta); 473 | MicroProfileModKey(glfwGetKey(window, GLFW_KEY_LEFT_SHIFT) == GLFW_PRESS); 474 | 475 | if (keyPressed[GLFW_KEY_ESCAPE]) 476 | break; 477 | 478 | if (keyPressed[GLFW_KEY_O]) 479 | MicroProfileToggleDisplayMode(); 480 | 481 | if (keyPressed[GLFW_KEY_P]) 482 | { 483 | paused = !paused; 484 | MicroProfileTogglePause(); 485 | } 486 | 487 | if (keyPressed[GLFW_KEY_I]) 488 | currentIslandMode = (currentIslandMode + 1) % (sizeof(kIslandModes) / sizeof(kIslandModes[0])); 489 | 490 | if (keyPressed[GLFW_KEY_M]) 491 | currentSolveMode = (currentSolveMode + 1) % (sizeof(kSolveModes) / sizeof(kSolveModes[0])); 492 | 493 | if (keyPressed[GLFW_KEY_R]) 494 | currentSceneName = resetWorld(world, currentScene); 495 | 496 | if (keyPressed[GLFW_KEY_S]) 497 | currentSceneName = resetWorld(world, ++currentScene); 498 | 499 | if (keyPressed[GLFW_KEY_C]) 500 | { 501 | unsigned int cores = queue->getWorkerCount() + 1; 502 | unsigned int newcores = 503 | (cores == WorkQueue::getIdealWorkerCount()) 504 | ? 1 505 | : std::min(cores * 2, WorkQueue::getIdealWorkerCount()); 506 | 507 | queue.reset(new WorkQueue(newcores - 1)); 508 | } 509 | 510 | if (glfwGetKey(window, GLFW_KEY_LEFT)) 511 | viewOffsetX -= 10; 512 | 513 | if (glfwGetKey(window, GLFW_KEY_RIGHT)) 514 | viewOffsetX += 10; 515 | 516 | if (glfwGetKey(window, GLFW_KEY_UP)) 517 | viewScale *= 1.05f; 518 | 519 | if (glfwGetKey(window, GLFW_KEY_DOWN)) 520 | viewScale /= 1.05f; 521 | } 522 | } 523 | 524 | glfwDestroyWindow(window); 525 | glfwTerminate(); 526 | 527 | MicroProfileShutdown(); 528 | } --------------------------------------------------------------------------------
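A note on the main loop in `src/main.cpp` above: simulation is decoupled from rendering with a fixed-timestep check. `world.Update` runs only once wall-clock time (`glfwGetTime()`) has passed `prevUpdateTime + integrationTime`, and `prevUpdateTime` then advances by exactly one `integrationTime` (1/60 s), so the physics step size stays fixed regardless of the display rate. Below is a minimal standalone sketch of that same pattern; the `stepPhysics`/`render` stand-ins and the use of `std::chrono` in place of `glfwGetTime` are illustrative assumptions, not part of PhyX.

```cpp
// Sketch of the fixed-timestep accumulator used by the demo loop (assumptions noted above).
#include <chrono>
#include <functional>

void runLoop(const std::function<bool()>& shouldClose,
             const std::function<void(float)>& stepPhysics, // stand-in for world.Update(...)
             const std::function<void()>& render)           // stand-in for the GL drawing code
{
    using clock = std::chrono::steady_clock;

    const float integrationTime = 1 / 60.f; // fixed physics step, matching main.cpp
    const auto start = clock::now();

    double prevUpdateTime = 0.0;

    while (!shouldClose())
    {
        double now = std::chrono::duration<double>(clock::now() - start).count();

        // At most one physics step per rendered frame, as in the original loop:
        // if rendering falls behind, the simulation slows down rather than spiraling.
        if (now > prevUpdateTime + integrationTime)
        {
            prevUpdateTime += integrationTime;
            stepPhysics(integrationTime);
        }

        render();
    }
}
```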