├── .clang-format ├── .gitignore ├── .gitmodules ├── LICENSE.md ├── Makefile ├── README.md ├── packages.config ├── phyx.sln ├── phyx.vcxproj ├── phyx.vcxproj.filters └── src ├── AABB2.h ├── Collider.cpp ├── Collider.h ├── Configuration.h ├── Coords2.h ├── Geom.h ├── Joints.h ├── Manifold.h ├── RigidBody.h ├── Solver.cpp ├── Solver.h ├── Vector2.h ├── World.cpp ├── World.h ├── base ├── AlignedArray.h ├── DenseHash.h ├── Parallel.h ├── RadixSort.h ├── SIMD.h ├── SIMD_AVX2.h ├── SIMD_AVX2_Transpose.h ├── SIMD_SSE2.h ├── SIMD_Scalar.h ├── WorkQueue.cpp ├── WorkQueue.h └── microprofile.cpp ├── glad ├── glad.c ├── glad.h └── khrplatform.h └── main.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | UseTab: Never 2 | IndentWidth: 4 3 | BreakBeforeBraces: Allman 4 | AllowShortIfStatementsOnASingleLine: true 5 | IndentCaseLabels: false 6 | ColumnLimit: 0 7 | PointerAlignment: Left 8 | BreakConstructorInitializersBeforeComma: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .microprofilepreset.* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/microprofile"] 2 | path = src/microprofile 3 | url = git@github.com:zeux/microprofile.git 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Alexander Sannikov, Arseny Kapoulkine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .SUFFIXES: 2 | MAKEFLAGS+=-r 3 | 4 | BUILD=build 5 | 6 | SOURCES=$(wildcard src/*.cpp src/base/*.cpp) 7 | OBJECTS=$(SOURCES:%=$(BUILD)/%.o) 8 | 9 | EXECUTABLE=$(BUILD)/phyx 10 | 11 | CXXFLAGS=-g -Wall -std=c++11 -O3 -DNDEBUG -ffast-math -Isrc/microprofile 12 | LDFLAGS= 13 | 14 | ifeq ($(shell uname),Darwin) 15 | CXXFLAGS+=-mavx2 -mfma 16 | LDFLAGS+=-lglfw3 -framework OpenGL 17 | else 18 | CPUINFO=$(shell cat /proc/cpuinfo) 19 | ifneq ($(findstring avx2,$(CPUINFO)),) 20 | CXXFLAGS+=-mavx2 21 | endif 22 | ifneq ($(findstring fma,$(CPUINFO)),) 23 | CXXFLAGS+=-mfma 24 | endif 25 | LDFLAGS+=-lglfw -lGL -lpthread 26 | endif 27 | 28 | all: $(EXECUTABLE) 29 | ./$(EXECUTABLE) 30 | 31 | $(EXECUTABLE): $(OBJECTS) 32 | $(CXX) $(OBJECTS) $(LDFLAGS) -o $@ 33 | 34 | $(BUILD)/%.o: % 35 | @mkdir -p $(dir $@) 36 | $(CXX) $< $(CXXFLAGS) -c -MMD -MP -o $@ 37 | 38 | -include $(OBJECTS:.o=.d) 39 | clean: 40 | rm -rf $(BUILD) 41 | 42 | .PHONY: all clean 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PhyX 2 | 3 | ## Overview 4 | 5 | This is a 2D physics engine with SoA/SIMD/multicore optimizations. The primary purpose of this project is to experiment with various multi-threading and SIMD algorithms - it is not meant to be a production-ready 2D physics engine. 6 | 7 | The engine is based on SusliX by Alexander Sannikov aka Suslik <i.make.physics at gmail.com> \ 8 | The optimizations are implemented by Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 9 | 10 | The code is licensed under the MIT license. 11 | 12 | ## Video 13 | 14 | [![Massive Rigid Body Simulation](https://img.youtube.com/vi/2-UZkEjnBu4/0.jpg)](https://www.youtube.com/watch?v=2-UZkEjnBu4) 15 | 16 | ## Building 17 | 18 | To build the project, you first have to clone it using the `--recursive` option to fetch the [microprofile](https://github.com/zeux/microprofile) submodule. 19 | 20 | On Linux/Mac, use `make` to build and run the demo (the default target builds the executable and then launches it). The Makefile automatically picks the optimal compilation settings for the host system, using AVX2/FMA as appropriate. 21 | 22 | On Windows, open `phyx.sln` in Visual Studio 2017 and build & run from there. Note that the project file is configured to assume AVX2 support; if your system doesn't have AVX2, you will need to disable it by removing `__AVX2__` from the preprocessor defines and switching the code generation instruction set to "Streaming SIMD Extensions 2 (/arch:SSE2)" - both modifications can be made in the project properties. 23 | 24 | ## Features 25 | 26 | The engine implements a traditional physics pipeline - broadphase, narrowphase, contact pairing/caching, island splitting, solve (using sequential impulses). 27 | 28 | There is only one collision primitive - box. It should be straightforward to add support for more primitives, as long as each manifold has a small number of contact points. 29 | 30 | There is only one constraint type - contact. It should be possible to add support for more constraint types, although that might require extensive changes to some algorithms, in particular SIMD. 31 | 32 | The broadphase algorithm is single-axis sweep&prune, using a non-incremental 3-pass radix sort. Incremental algorithms tend to be expensive when a lot of updates are performed, and sweep&prune is a good fit for 2D, assuming the worlds are mostly horizontal.
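As a rough illustration, here is a minimal sketch of the pair-generation pass described above. The names (`SweepEntry`, `SweepAndPrune`) are hypothetical and the sorting is simplified to `std::sort`; the actual implementation lives in `Collider::UpdatePairsSerial`/`Collider::UpdatePairsParallel` in `src/Collider.cpp`, which sorts with a radix sort and also maintains the manifold cache:

```c++
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// Hypothetical broadphase entry, mirroring the shape of Collider::BroadphaseEntry:
// min/max on the sweep (X) axis, center/extent on the other axis.
struct SweepEntry
{
    float minx, maxx;
    float centery, extenty;
    unsigned int index;
};

// Single-axis sweep & prune: sort by minx, then each entry only scans forward
// while the next entry still begins before the current entry ends on X.
void SweepAndPrune(std::vector<SweepEntry>& entries, std::vector<std::pair<unsigned, unsigned>>& pairs)
{
    std::sort(entries.begin(), entries.end(),
        [](const SweepEntry& a, const SweepEntry& b) { return a.minx < b.minx; });

    for (size_t i = 0; i < entries.size(); ++i)
        for (size_t j = i + 1; j < entries.size() && entries[j].minx <= entries[i].maxx; ++j)
            if (std::fabs(entries[j].centery - entries[i].centery) <= entries[i].extenty + entries[j].extenty)
                pairs.push_back(std::make_pair(entries[i].index, entries[j].index));
}
```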
33 | 34 | ## Controls 35 | 36 | In the demo you can use several keys to switch between various modes and cycle between scenes: 37 | 38 | * Left/Right: move the viewport 39 | * Up/Down: scale the viewport 40 | * S: Switch to the next scene; different scenes test different configurations of bodies, stress testing various parts of the pipeline 41 | * R: Reset the current scene 42 | * I: Switch island mode (see below) 43 | * M: Switch solve mode (see below) 44 | * C: Switch the number of cores the solver uses (1, 2, 4, 8, etc. - up to the number of logical cores on the machine) 45 | * P: Pause simulation 46 | * O: Cycle between various microprofile display views 47 | 48 | ## Island splitting 49 | 50 | You can switch between different island construction modes by using the `I` key: 51 | 52 | * Single: no island splitting is performed; constraint solving is effectively single-threaded. 53 | * Multiple: objects are split into islands, and each island is solved serially. Island splitting can be expensive for complex constraint graphs. 54 | * Single Sloppy: no island splitting is performed; constraint solving is multi-threaded. Each internal solve step is still serialized, which makes sure that - barring rare race conditions - impulse propagation remains effective. 55 | * Multiple Sloppy: objects are split into islands, and constraint solving within one island is multi-threaded. Compared to Single Sloppy, this requires (potentially expensive) island splitting, but preserves mechanism integrity for small islands. 56 | 57 | ## SIMD 58 | 59 | The code uses a custom SIMD library and templated code that enable SIMD computations with both SSE2 (4-wide) and AVX2 (8-wide) from the same codebase. You can switch between different SIMD widths - 1, 4, 8 - using the `M` key. 60 | 61 | The library interface is structured to make it easy to write complex algebraic code, including conditions: 62 | 63 | ```c++ 64 | Vf dv = -bounce * (relativeVelocityX * collision_normalX + relativeVelocityY * collision_normalY); 65 | Vf depth = (point2X - point1X) * collision_normalX + (point2Y - point1Y) * collision_normalY; 66 | 67 | Vf dstVelocity = max(dv - deltaVelocity, Vf::zero()); 68 | 69 | Vf j_normalLimiter_dstVelocity = select(dstVelocity, dstVelocity - maxPenetrationVelocity, depth < deltaDepth); 70 | Vf j_normalLimiter_dstDisplacingVelocity = errorReduction * max(Vf::zero(), depth - Vf::one(2.0f) * deltaDepth); 71 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::zero(); 72 | ``` 73 | 74 | To be able to use SIMD efficiently, we split islands into groups of N independent constraints (that affect 2\*N bodies), where N is the SIMD width. The constraint data is packed into AoSoA arrays (array of structures of arrays), also known as block SoA: the block size matches the SIMD width, and each field of each vector is scalarized so that the data can be loaded and stored efficiently without a need to transpose. This structure is maintained throughout all internal iterations of the solver. 75 | 76 | ## Threading 77 | 78 | The code uses a fairly standard thread pool implementation with a single queue for items. This thread pool is used for distributing work across worker threads - the batches of work are very large, for example "one island" or "all contact points", to compensate for the overhead of the locking/notification mechanisms.
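For illustration, the sketch below shows a minimal single-queue thread pool of the kind described above. The names (`TaskQueue`, `push`) are hypothetical; the engine's actual implementation is `WorkQueue` in `src/base/WorkQueue.h`, which additionally backs the `parallelFor` primitive described next:

```c++
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Minimal single-queue thread pool: every worker pops from one mutex-protected queue.
// Coarse work items keep the locking/notification overhead negligible.
class TaskQueue
{
public:
    explicit TaskQueue(unsigned workerCount)
    {
        for (unsigned i = 0; i < workerCount; ++i)
            workers.emplace_back([this] { workerLoop(); });
    }

    ~TaskQueue()
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            done = true;
        }
        wake.notify_all();
        for (std::thread& worker : workers)
            worker.join();
    }

    void push(std::function<void()> task)
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            tasks.push(std::move(task));
        }
        wake.notify_one();
    }

private:
    void workerLoop()
    {
        for (;;)
        {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                wake.wait(lock, [this] { return done || !tasks.empty(); });
                if (tasks.empty())
                    return; // done was set and no work is left
                task = std::move(tasks.front());
                tasks.pop();
            }
            task(); // run the item outside the lock
        }
    }

    std::vector<std::thread> workers;
    std::queue<std::function<void()>> tasks;
    std::mutex mutex;
    std::condition_variable wake;
    bool done = false;
};
```

The real `WorkQueue` also exposes the worker count (`getWorkerCount`), which the solver uses to size per-worker scratch buffers such as `productivew` and the per-worker manifold buffers.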
79 | 80 | On top of that we implement a `parallelFor` primitive that uses a shared atomic variable to concurrently process a large array of items. To reduce contention for the shared atomic, the arrays are split into small groups for batched processing. The code for using parallel for looks like this: 81 | 82 | ```c++ 83 | parallelFor(queue, 0, jointEnd - jointBegin, 128, [&](int i, int) { 84 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 85 | 86 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 87 | int iP = i & (N - 1); 88 | 89 | jointP.body1Index[iP] = joint.body1Index; 90 | jointP.body2Index[iP] = joint.body2Index; 91 | jointP.contactPointIndex[iP] = joint.contactPointIndex; 92 | 93 | jointP.normalLimiter_accumulatedImpulse[iP] = joint.normalLimiter_accumulatedImpulse; 94 | jointP.frictionLimiter_accumulatedImpulse[iP] = joint.frictionLimiter_accumulatedImpulse; 95 | }); 96 | ``` 97 | 98 | A bit of care is required to find optimal group size for each for loop; additionally, for small arrays it might be worthwhile to implement a serial path - although in case of the code above, `parallelFor` automatically serializes execution if there is only one group - that is, if the number of joints (`jointEnd - jointBegin`) is less than or equal to 128. 99 | -------------------------------------------------------------------------------- /packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | -------------------------------------------------------------------------------- /phyx.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.106 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "phyx", "phyx.vcxproj", "{40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x64.ActiveCfg = Debug|x64 17 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x64.Build.0 = Debug|x64 18 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x86.ActiveCfg = Debug|Win32 19 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Debug|x86.Build.0 = Debug|Win32 20 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x64.ActiveCfg = Release|x64 21 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x64.Build.0 = Release|x64 22 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x86.ActiveCfg = Release|Win32 23 | {40C91DE2-9442-45F0-8BB5-DC8CC4123A3E}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {9762CF5D-3DD5-4853-8CF4-4B05F956CFB0} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /phyx.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | 
{40C91DE2-9442-45F0-8BB5-DC8CC4123A3E} 24 | phyx 25 | 10.0.17763.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | MultiByte 33 | 34 | 35 | Application 36 | false 37 | v141 38 | true 39 | MultiByte 40 | 41 | 42 | Application 43 | true 44 | v141 45 | MultiByte 46 | 47 | 48 | Application 49 | false 50 | v141 51 | true 52 | MultiByte 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | Level3 76 | Disabled 77 | true 78 | true 79 | src/microprofile 80 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;%(PreprocessorDefinitions) 81 | Fast 82 | AdvancedVectorExtensions2 83 | 84 | 85 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 86 | 87 | 88 | 89 | 90 | Level3 91 | Disabled 92 | true 93 | true 94 | src/microprofile 95 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;%(PreprocessorDefinitions) 96 | Fast 97 | AdvancedVectorExtensions2 98 | 99 | 100 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 101 | 102 | 103 | 104 | 105 | Level3 106 | MaxSpeed 107 | true 108 | true 109 | true 110 | src/microprofile 111 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;NDEBUG;%(PreprocessorDefinitions) 112 | Fast 113 | AdvancedVectorExtensions2 114 | false 115 | 116 | 117 | true 118 | true 119 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 120 | 121 | 122 | 123 | 124 | Level3 125 | MaxSpeed 126 | true 127 | true 128 | true 129 | src/microprofile 130 | _CRT_SECURE_NO_WARNINGS;__SSE2__;__AVX2__;NDEBUG;%(PreprocessorDefinitions) 131 | Fast 132 | AdvancedVectorExtensions2 133 | false 134 | 135 | 136 | true 137 | true 138 | opengl32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /phyx.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {cc5792bf-4514-426c-b977-98bb3782e085} 10 | 11 | 12 | {40ef955c-c1c2-4774-ad98-2e48043cd809} 13 | 14 | 15 | {c55d19a8-61f0-42bf-a230-4f8179c9f303} 16 | 17 | 18 | 19 | 20 | src 21 | 22 | 23 | src 24 | 25 | 26 | src 27 | 28 | 29 | src 30 | 31 | 32 | src 33 | 34 | 35 | src 36 | 37 | 38 | src 39 | 40 | 41 | src 42 | 43 | 44 | src 45 | 46 | 47 | src 48 | 49 | 50 | src 51 | 52 | 53 | src\microprofile 54 | 55 | 56 | src\microprofile 57 | 58 | 59 | src\microprofile 60 | 61 | 62 | src\microprofile 63 | 64 | 65 | src\base 66 | 67 | 68 | src\base 69 | 70 | 71 | src\base 72 | 73 | 74 | src\base 75 | 76 | 77 | src\base 78 | 79 | 80 | src\base 81 | 82 | 83 | src\base 84 | 85 | 86 | src\base 87 | 88 | 89 | src\base 90 | 91 | 92 | src\base 93 | 94 | 95 | src\glad 96 | 97 | 98 | 99 | 100 | src 101 | 102 | 103 | src 104 | 105 | 106 | src 107 | 108 | 109 | src 110 | 111 | 112 | src\base 113 | 114 | 115 | src\base 116 | 117 | 118 | src\glad 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /src/AABB2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | class AABB2 7 | { 8 | public: 9 | AABB2() 10 | { 11 | Reset(); 12 | } 13 | AABB2(const Vector2& _boxPoint1, const Vector2& _boxPoint2) 14 | { 15 | Set(_boxPoint1, _boxPoint2); 16 | } 17 | 18 | bool Intersects(const AABB2& aabb) const 19 | { 20 | if ((boxPoint1.x > aabb.boxPoint2.x) || (aabb.boxPoint1.x > boxPoint2.x)) return 0; 21 | if ((boxPoint1.y > aabb.boxPoint2.y) || (aabb.boxPoint1.y > boxPoint2.y)) return 0; 22 | return 1; 23 | } 24 | 25 | template 26 | bool Intersects(Vector2 rayStart, Vector2 rayDir, float& paramMin, float& paramMax) 27 | { 28 | // r.dir is unit direction vector of ray 29 | Vector2f invDir(1.0f / rayDir.x, 1.0f / rayDir.y); 30 | 31 | // lb is the corner of AABB with minimal coordinates - left bottom, rt is maximal corner 32 | // r.org is origin of ray 33 | float t1 = (boxPoint1.x - rayStart.x) * invDir.x; 34 | float t2 = (boxPoint2.x - rayStart.x) * invDir.x; 35 | float t3 = (boxPoint1.y - rayStart.y) * invDir.y; 36 | float t4 = (boxPoint2.y - rayStart.y) * invDir.y; 37 | 38 | paramMin = std::max(std::min(t1, t2), std::min(t3, t4)); 39 | paramMax = std::min(std::max(t1, t2), std::max(t3, t4)); 40 | 41 | // if tmax < 0, ray (line) is intersecting AABB, but whole AABB is behing us 42 | if (isFiniteCast && paramMax < 0) 43 | { 44 | return false; 45 | } 46 | 47 | // if tmin > tmax, ray doesn't intersect AABB 48 | if (paramMin > paramMax) 49 | { 50 | return false; 51 | } 52 | return true; 53 | } 54 | bool Includes(const Vector2& point) const 55 | { 56 | if ((point.x < boxPoint1.x) || (point.x > boxPoint2.x) || 57 | (point.y < boxPoint1.y) || (point.y > boxPoint2.y)) 58 | { 59 | return 0; 60 | } 61 | return 1; 62 | } 63 | bool Includes(const AABB2& aabb) const 64 | { 65 | return Includes(aabb.boxPoint1) && Includes(aabb.boxPoint2); 66 | } 67 | void Set(const Vector2& _boxPoint1, const Vector2& _boxPoint2) 68 | { 69 | boxPoint1 = _boxPoint1; 70 | boxPoint2 = _boxPoint2; 71 | } 72 | void Reset() 73 | { 74 | boxPoint1 = Vector2::zeroVector(); 75 | boxPoint2 = 
Vector2::zeroVector(); 76 | } 77 | void Expand(const Vector2& additionalPoint) 78 | { 79 | boxPoint1.x = Min(boxPoint1.x, additionalPoint.x); 80 | boxPoint1.y = Min(boxPoint1.y, additionalPoint.y); 81 | 82 | boxPoint2.x = Max(boxPoint2.x, additionalPoint.x); 83 | boxPoint2.y = Max(boxPoint2.y, additionalPoint.y); 84 | } 85 | void Expand(const AABB2& internalAABB) 86 | { 87 | Expand(internalAABB.boxPoint1); 88 | Expand(internalAABB.boxPoint2); 89 | } 90 | void Expand(const T mult) 91 | { 92 | Vector2 size = boxPoint2 - boxPoint1; 93 | boxPoint1 -= size * (mult - T(1.0)) * T(0.5); 94 | boxPoint2 += size * (mult - T(1.0)) * T(0.5); 95 | } 96 | const T Square() //perimeter actually, used for AABBTree balancing 97 | { 98 | Vector2 size = boxPoint2 - boxPoint1; 99 | return (size.x + size.y) * T(2.0); 100 | } 101 | Vector2 GetSize() 102 | { 103 | return boxPoint2 - boxPoint1; 104 | } 105 | Vector2 boxPoint1, boxPoint2; 106 | }; 107 | 108 | typedef AABB2 AABB2f; 109 | typedef AABB2 AABB2d; 110 | -------------------------------------------------------------------------------- /src/Collider.cpp: -------------------------------------------------------------------------------- 1 | #include "Collider.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "base/RadixSort.h" 5 | 6 | #include "microprofile.h" 7 | 8 | static NOINLINE bool ComputeSeparatingAxis(RigidBody* body1, RigidBody* body2, Vector2f& separatingAxis) 9 | { 10 | // http://www.geometrictools.com/Source/Intersection2D.html#PlanarPlanar 11 | // Adapted to return axis with least amount of penetration 12 | const Vector2f* A0 = &body1->coords.xVector; 13 | const Vector2f* A1 = &body2->coords.xVector; 14 | const Vector2f& E0 = body1->geom.size; 15 | const Vector2f& E1 = body2->geom.size; 16 | 17 | Vector2f D = body1->coords.pos - body2->coords.pos; 18 | 19 | float Adot[2][2]; 20 | 21 | // Test axis box0.axis[0] 22 | Adot[0][0] = fabsf(A0[0] * A1[0]); 23 | Adot[0][1] = fabsf(A0[0] * A1[1]); 24 | 25 | float rSum0 = E0.x + E1.x * Adot[0][0] + E1.y * Adot[0][1]; 26 | float dist0 = fabsf(A0[0] * D) - rSum0; 27 | if (dist0 > 0) return false; 28 | 29 | float bestdist = dist0; 30 | Vector2f bestaxis = A0[0]; 31 | 32 | // Test axis box0.axis[1]. 33 | Adot[1][0] = fabsf(A0[1] * A1[0]); 34 | Adot[1][1] = fabsf(A0[1] * A1[1]); 35 | 36 | float rSum1 = E0.y + E1.x * Adot[1][0] + E1.y * Adot[1][1]; 37 | float dist1 = fabsf(A0[1] * D) - rSum1; 38 | if (dist1 > 0) return false; 39 | if (dist1 > bestdist) { bestdist = dist1; bestaxis = A0[1]; } 40 | 41 | // Test axis box1.axis[0]. 42 | float rSum2 = E1.x + E0.x * Adot[0][0] + E0.y * Adot[1][0]; 43 | float dist2 = fabsf(A1[0] * D) - rSum2; 44 | if (dist2 > 0) return false; 45 | if (dist2 > bestdist) { bestdist = dist2; bestaxis = A1[0]; } 46 | 47 | // Test axis box1.axis[1]. 
48 | float rSum3 = E1.y + E0.x * Adot[0][1] + E0.y * Adot[1][1]; 49 | float dist3 = fabsf(A1[1] * D) - rSum3; 50 | if (dist3 > 0) return false; 51 | if (dist3 > bestdist) { bestdist = dist3; bestaxis = A1[1]; } 52 | 53 | separatingAxis = bestaxis; 54 | return true; 55 | } 56 | 57 | static void AddPoint(ContactPoint* points, int& pointCount, ContactPoint& newbie) 58 | { 59 | ContactPoint* closest = 0; 60 | float bestdepth = std::numeric_limits::max(); 61 | 62 | for (int collisionIndex = 0; collisionIndex < pointCount; collisionIndex++) 63 | { 64 | ContactPoint& col = points[collisionIndex]; 65 | 66 | if (newbie.Equals(col, 2.0f)) 67 | { 68 | float depth = (newbie.delta1 - col.delta1).SquareLen() + (newbie.delta2 - col.delta2).SquareLen(); 69 | if (depth < bestdepth) 70 | { 71 | bestdepth = depth; 72 | closest = &col; 73 | } 74 | } 75 | } 76 | 77 | if (closest) 78 | { 79 | closest->isMerged = 1; 80 | closest->isNewlyCreated = 0; 81 | closest->normal = newbie.normal; 82 | closest->delta1 = newbie.delta1; 83 | closest->delta2 = newbie.delta2; 84 | } 85 | else 86 | { 87 | assert(pointCount < 4); 88 | newbie.isMerged = 1; 89 | newbie.isNewlyCreated = 1; 90 | points[pointCount++] = newbie; 91 | } 92 | } 93 | 94 | static void NOINLINE GenerateContacts(RigidBody* body1, RigidBody* body2, ContactPoint* points, int& pointCount, Vector2f separatingAxis) 95 | { 96 | if (separatingAxis * (body1->coords.pos - body2->coords.pos) < 0.0f) 97 | separatingAxis.Invert(); 98 | 99 | const int kMaxSupportPoints = 2; 100 | Vector2f supportPoints1[kMaxSupportPoints]; 101 | Vector2f supportPoints2[kMaxSupportPoints]; 102 | 103 | float linearTolerance = 2.0f; 104 | 105 | int supportPointsCount1 = body1->geom.GetSupportPointSet(-separatingAxis, supportPoints1); 106 | int supportPointsCount2 = body2->geom.GetSupportPointSet(separatingAxis, supportPoints2); 107 | 108 | if ((supportPointsCount1 == 2) && (((supportPoints1[0] - supportPoints1[1])).SquareLen() < linearTolerance * linearTolerance)) 109 | { 110 | supportPoints1[0] = (supportPoints1[0] + supportPoints1[1]) * 0.5f; 111 | supportPointsCount1 = 1; 112 | } 113 | if ((supportPointsCount2 == 2) && (((supportPoints2[0] - supportPoints2[1])).SquareLen() < linearTolerance * linearTolerance)) 114 | { 115 | supportPoints2[0] = (supportPoints2[0] + supportPoints2[1]) * 0.5f; 116 | supportPointsCount2 = 1; 117 | } 118 | 119 | if ((supportPointsCount1 == 1) && (supportPointsCount2 == 1)) 120 | { 121 | Vector2f delta = supportPoints2[0] - supportPoints1[0]; 122 | //float eps = (delta ^ separatingAxis).SquareLen(); 123 | if (delta * separatingAxis >= 0.0f) 124 | { 125 | ContactPoint newbie(supportPoints1[0], supportPoints2[0], separatingAxis, body1, body2); 126 | AddPoint(points, pointCount, newbie); 127 | } 128 | } 129 | else if ((supportPointsCount1 == 1) && (supportPointsCount2 == 2)) 130 | { 131 | Vector2f n = (supportPoints2[1] - supportPoints2[0]).GetPerpendicular(); 132 | Vector2f point; 133 | ProjectPointToLine(supportPoints1[0], supportPoints2[0], n, separatingAxis, point); 134 | 135 | if ((((point - supportPoints2[0]) * (supportPoints2[1] - supportPoints2[0])) >= 0.0f) && 136 | (((point - supportPoints2[1]) * (supportPoints2[0] - supportPoints2[1])) >= 0.0f)) 137 | { 138 | ContactPoint newbie(supportPoints1[0], point, separatingAxis, body1, body2); 139 | AddPoint(points, pointCount, newbie); 140 | } 141 | } 142 | else if ((supportPointsCount1 == 2) && (supportPointsCount2 == 1)) 143 | { 144 | Vector2f n = (supportPoints1[1] - supportPoints1[0]).GetPerpendicular(); 
145 | Vector2f point; 146 | ProjectPointToLine(supportPoints2[0], supportPoints1[0], n, separatingAxis, point); 147 | 148 | if ((((point - supportPoints1[0]) * (supportPoints1[1] - supportPoints1[0])) >= 0.0f) && 149 | (((point - supportPoints1[1]) * (supportPoints1[0] - supportPoints1[1])) >= 0.0f)) 150 | { 151 | ContactPoint newbie(point, supportPoints2[0], separatingAxis, body1, body2); 152 | AddPoint(points, pointCount, newbie); 153 | } 154 | } 155 | else if ((supportPointsCount2 == 2) && (supportPointsCount1 == 2)) 156 | { 157 | struct TempColInfo 158 | { 159 | Vector2f point1, point2; 160 | }; 161 | TempColInfo tempCol[4]; 162 | int tempCols = 0; 163 | for (int i = 0; i < 2; i++) 164 | { 165 | Vector2f n = (supportPoints2[1] - supportPoints2[0]).GetPerpendicular(); 166 | if ((supportPoints1[i] - supportPoints2[0]) * n >= 0.0) 167 | { 168 | Vector2f point; 169 | ProjectPointToLine(supportPoints1[i], supportPoints2[0], n, separatingAxis, point); 170 | 171 | if ((((point - supportPoints2[0]) * (supportPoints2[1] - supportPoints2[0])) >= 0.0f) && 172 | (((point - supportPoints2[1]) * (supportPoints2[0] - supportPoints2[1])) >= 0.0f)) 173 | { 174 | tempCol[tempCols].point1 = supportPoints1[i]; 175 | tempCol[tempCols].point2 = point; 176 | tempCols++; 177 | } 178 | } 179 | } 180 | for (int i = 0; i < 2; i++) 181 | { 182 | Vector2f n = (supportPoints1[1] - supportPoints1[0]).GetPerpendicular(); 183 | if ((supportPoints2[i] - supportPoints1[0]) * n >= 0.0) 184 | { 185 | Vector2f point; 186 | ProjectPointToLine(supportPoints2[i], supportPoints1[0], n, separatingAxis, point); 187 | 188 | if ((((point - supportPoints1[0]) * (supportPoints1[1] - supportPoints1[0])) >= 0.0f) && 189 | (((point - supportPoints1[1]) * (supportPoints1[0] - supportPoints1[1])) >= 0.0f)) 190 | { 191 | tempCol[tempCols].point1 = point; 192 | tempCol[tempCols].point2 = supportPoints2[i]; 193 | tempCols++; 194 | } 195 | } 196 | } 197 | 198 | if (tempCols == 1) //buggy but must work 199 | { 200 | ContactPoint newbie(tempCol[0].point1, tempCol[0].point2, separatingAxis, body1, body2); 201 | AddPoint(points, pointCount, newbie); 202 | } 203 | if (tempCols >= 2) //means only equality, but clamp to two points 204 | { 205 | ContactPoint newbie1(tempCol[0].point1, tempCol[0].point2, separatingAxis, body1, body2); 206 | AddPoint(points, pointCount, newbie1); 207 | ContactPoint newbie2(tempCol[1].point1, tempCol[1].point2, separatingAxis, body1, body2); 208 | AddPoint(points, pointCount, newbie2); 209 | } 210 | } 211 | } 212 | 213 | static void UpdateManifold(Manifold& m, RigidBody* bodies, ContactPoint* points) 214 | { 215 | ContactPoint newpoints[kMaxContactPoints * 2]; 216 | 217 | for (int collisionIndex = 0; collisionIndex < m.pointCount; collisionIndex++) 218 | { 219 | newpoints[collisionIndex] = points[collisionIndex]; 220 | newpoints[collisionIndex].isMerged = 0; 221 | newpoints[collisionIndex].isNewlyCreated = 0; 222 | } 223 | 224 | int newPointCount = m.pointCount; 225 | 226 | RigidBody* body1 = &bodies[m.body1Index]; 227 | RigidBody* body2 = &bodies[m.body2Index]; 228 | 229 | Vector2f separatingAxis; 230 | if (ComputeSeparatingAxis(body1, body2, separatingAxis)) 231 | { 232 | GenerateContacts(body1, body2, newpoints, newPointCount, separatingAxis); 233 | } 234 | 235 | m.pointCount = 0; 236 | 237 | for (int collisionIndex = 0; collisionIndex < newPointCount; ++collisionIndex) 238 | { 239 | if (newpoints[collisionIndex].isMerged) 240 | { 241 | assert(m.pointCount < kMaxContactPoints); 242 | points[m.pointCount++] = 
newpoints[collisionIndex]; 243 | } 244 | } 245 | } 246 | 247 | Collider::Collider() 248 | { 249 | } 250 | 251 | NOINLINE void Collider::UpdateBroadphase(RigidBody* bodies, size_t bodiesCount) 252 | { 253 | MICROPROFILE_SCOPEI("Physics", "UpdateBroadphase", -1); 254 | 255 | broadphase.resize(bodiesCount); 256 | broadphaseSort[0].resize(bodiesCount); 257 | broadphaseSort[1].resize(bodiesCount); 258 | 259 | for (size_t bodyIndex = 0; bodyIndex < bodiesCount; ++bodyIndex) 260 | { 261 | const AABB2f& aabb = bodies[bodyIndex].geom.aabb; 262 | 263 | broadphaseSort[0][bodyIndex].value = radixFloat(aabb.boxPoint1.x); 264 | broadphaseSort[0][bodyIndex].index = bodyIndex; 265 | } 266 | 267 | radixSort3(broadphaseSort[0].data, broadphaseSort[1].data, bodiesCount, [](const BroadphaseSortEntry& e) { return e.value; }); 268 | 269 | for (size_t i = 0; i < bodiesCount; ++i) 270 | { 271 | unsigned int bodyIndex = broadphaseSort[1][i].index; 272 | 273 | const AABB2f& aabb = bodies[bodyIndex].geom.aabb; 274 | 275 | BroadphaseEntry e = 276 | { 277 | aabb.boxPoint1.x, aabb.boxPoint2.x, 278 | (aabb.boxPoint1.y + aabb.boxPoint2.y) * 0.5f, 279 | (aabb.boxPoint2.y - aabb.boxPoint1.y) * 0.5f, 280 | unsigned(bodyIndex)}; 281 | 282 | broadphase[i] = e; 283 | } 284 | } 285 | 286 | NOINLINE void Collider::UpdatePairs(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount) 287 | { 288 | assert(bodiesCount == broadphase.size); 289 | 290 | if (queue.getWorkerCount() == 0) 291 | UpdatePairsSerial(bodies, bodiesCount); 292 | else 293 | UpdatePairsParallel(queue, bodies, bodiesCount); 294 | } 295 | 296 | NOINLINE void Collider::UpdatePairsSerial(RigidBody* bodies, size_t bodiesCount) 297 | { 298 | MICROPROFILE_SCOPEI("Physics", "UpdatePairsSerial", -1); 299 | 300 | for (size_t bodyIndex1 = 0; bodyIndex1 < bodiesCount; bodyIndex1++) 301 | { 302 | const BroadphaseEntry& be1 = broadphase[bodyIndex1]; 303 | float maxx = be1.maxx; 304 | 305 | for (size_t bodyIndex2 = bodyIndex1 + 1; bodyIndex2 < bodiesCount; bodyIndex2++) 306 | { 307 | const BroadphaseEntry& be2 = broadphase[bodyIndex2]; 308 | if (be2.minx > maxx) 309 | break; 310 | 311 | if (fabsf(be2.centery - be1.centery) <= be1.extenty + be2.extenty) 312 | { 313 | if (manifoldMap.insert(std::make_pair(be1.index, be2.index))) 314 | { 315 | manifolds.push_back(Manifold(be1.index, be2.index, manifolds.size * kMaxContactPoints)); 316 | } 317 | } 318 | } 319 | } 320 | } 321 | 322 | NOINLINE void Collider::UpdatePairsParallel(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount) 323 | { 324 | MICROPROFILE_SCOPEI("Physics", "UpdatePairsParallel", -1); 325 | 326 | manifoldBuffers.resize(queue.getWorkerCount() + 1); 327 | 328 | for (auto& buf: manifoldBuffers) 329 | buf.pairs.clear(); 330 | 331 | parallelFor(queue, 0, bodiesCount, 128, [this, bodies, bodiesCount](int bodyIndex1, int worker) { 332 | UpdatePairsOne(bodies, bodyIndex1, bodyIndex1 + 1, bodiesCount, manifoldBuffers[worker]); 333 | }); 334 | 335 | MICROPROFILE_SCOPEI("Physics", "CreateManifolds", -1); 336 | 337 | for (auto& buf : manifoldBuffers) 338 | { 339 | for (auto& pair : buf.pairs) 340 | { 341 | manifoldMap.insert(pair); 342 | manifolds.push_back(Manifold(pair.first, pair.second, manifolds.size * kMaxContactPoints)); 343 | } 344 | } 345 | } 346 | 347 | void Collider::UpdatePairsOne(RigidBody* bodies, size_t bodyIndex1, size_t startIndex, size_t endIndex, ManifoldDeferredBuffer& buffer) 348 | { 349 | const BroadphaseEntry& be1 = broadphase[bodyIndex1]; 350 | float maxx = be1.maxx; 351 | 352 | for (size_t bodyIndex2 
= startIndex; bodyIndex2 < endIndex; bodyIndex2++) 353 | { 354 | const BroadphaseEntry& be2 = broadphase[bodyIndex2]; 355 | if (be2.minx > maxx) 356 | return; 357 | 358 | if (fabsf(be2.centery - be1.centery) <= be1.extenty + be2.extenty) 359 | { 360 | if (!manifoldMap.contains(std::make_pair(be1.index, be2.index))) 361 | { 362 | buffer.pairs.push_back(std::make_pair(be1.index, be2.index)); 363 | } 364 | } 365 | } 366 | } 367 | 368 | NOINLINE void Collider::UpdateManifolds(WorkQueue& queue, RigidBody* bodies) 369 | { 370 | MICROPROFILE_SCOPEI("Physics", "UpdateManifolds", -1); 371 | 372 | contactPoints.resize_copy(manifolds.size * kMaxContactPoints); 373 | 374 | parallelFor(queue, manifolds.data, manifolds.size, 16, [&](Manifold& m, int) { 375 | UpdateManifold(m, bodies, contactPoints.data + m.pointIndex); 376 | }); 377 | } 378 | 379 | NOINLINE void Collider::PackManifolds(RigidBody* bodies) 380 | { 381 | MICROPROFILE_SCOPEI("Physics", "PackManifolds", -1); 382 | 383 | for (int manifoldIndex = 0; manifoldIndex < manifolds.size;) 384 | { 385 | Manifold& m = manifolds[manifoldIndex]; 386 | 387 | // TODO 388 | // This reduces broadphase insert/erase operations, which is good 389 | // However, current behavior causes issues with DenseHash - is it possible to improve it? 390 | if (m.pointCount == 0 && !bodies[m.body1Index].geom.aabb.Intersects(bodies[m.body2Index].geom.aabb)) 391 | { 392 | manifoldMap.erase(std::make_pair(m.body1Index, m.body2Index)); 393 | 394 | if (manifoldIndex < manifolds.size) 395 | { 396 | Manifold& me = manifolds[manifolds.size - 1]; 397 | 398 | unsigned int pointIndex = m.pointIndex; 399 | 400 | for (int i = 0; i < me.pointCount; ++i) 401 | contactPoints[pointIndex + i] = contactPoints[me.pointIndex + i]; 402 | 403 | m = me; 404 | m.pointIndex = pointIndex; 405 | } 406 | 407 | manifolds.size--; 408 | } 409 | else 410 | { 411 | manifoldIndex++; 412 | } 413 | } 414 | 415 | contactPoints.truncate(manifolds.size * kMaxContactPoints); 416 | } -------------------------------------------------------------------------------- /src/Collider.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Manifold.h" 4 | #include "base/DenseHash.h" 5 | #include "base/AlignedArray.h" 6 | 7 | namespace std 8 | { 9 | template <> 10 | struct hash> 11 | { 12 | size_t operator()(const std::pair& p) const 13 | { 14 | unsigned int lb = p.first; 15 | unsigned int rb = p.second; 16 | 17 | return lb ^ (rb + 0x9e3779b9 + (lb << 6) + (lb >> 2)); 18 | } 19 | }; 20 | } 21 | 22 | class WorkQueue; 23 | 24 | struct Collider 25 | { 26 | Collider(); 27 | 28 | void UpdateBroadphase(RigidBody* bodies, size_t bodiesCount); 29 | void UpdatePairs(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount); 30 | void UpdatePairsSerial(RigidBody* bodies, size_t bodiesCount); 31 | void UpdatePairsParallel(WorkQueue& queue, RigidBody* bodies, size_t bodiesCount); 32 | 33 | struct ManifoldDeferredBuffer; 34 | 35 | void UpdatePairsOne(RigidBody* bodies, size_t bodyIndex1, size_t startIndex, size_t endIndex, ManifoldDeferredBuffer& buffer); 36 | 37 | void UpdateManifolds(WorkQueue& queue, RigidBody* bodies); 38 | void PackManifolds(RigidBody* bodies); 39 | 40 | struct ManifoldDeferredBuffer 41 | { 42 | AlignedArray> pairs; 43 | }; 44 | 45 | struct BroadphaseEntry 46 | { 47 | float minx, maxx; 48 | float centery, extenty; 49 | unsigned int index; 50 | }; 51 | 52 | struct BroadphaseSortEntry 53 | { 54 | unsigned int value; 55 | unsigned int index; 56 | }; 57 | 58 | 
DenseHashSet> manifoldMap; 59 | 60 | AlignedArray manifolds; 61 | AlignedArray contactPoints; 62 | 63 | std::vector manifoldBuffers; 64 | 65 | AlignedArray broadphase; 66 | AlignedArray broadphaseSort[2]; 67 | }; -------------------------------------------------------------------------------- /src/Configuration.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | struct Configuration 4 | { 5 | enum SolveMode 6 | { 7 | Solve_Scalar, 8 | Solve_SSE2, 9 | Solve_AVX2, 10 | }; 11 | 12 | enum IslandMode 13 | { 14 | Island_Single, 15 | Island_Multiple, 16 | Island_SingleSloppy, 17 | Island_MultipleSloppy 18 | }; 19 | 20 | SolveMode solveMode; 21 | IslandMode islandMode; 22 | int contactIterationsCount; 23 | int penetrationIterationsCount; 24 | }; -------------------------------------------------------------------------------- /src/Coords2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | class Coords2 5 | { 6 | public: 7 | Vector2 xVector, yVector; 8 | Vector2 pos; 9 | Coords2() {} 10 | Coords2(const Vector2& _pos, const T angle) 11 | { 12 | float pi = 3.141592f; 13 | xVector = Vector2(cos(angle), sin(angle)); 14 | yVector = Vector2(cos(angle + T(pi) / T(2.0)), sin(angle + T(pi) / T(2.0))); 15 | 16 | pos = _pos; 17 | } 18 | 19 | const Vector2 GetPointRelativePos(const Vector2& globalPoint) const 20 | { 21 | Vector2 delta = globalPoint - pos; 22 | return Vector2(delta * xVector, 23 | delta * yVector); 24 | } 25 | 26 | const Vector2 GetAxisRelativeOrientation(const Vector2& globalAxis) const 27 | { 28 | Vector2 delta = globalAxis; 29 | return Vector2(delta * xVector, 30 | delta * yVector); 31 | } 32 | 33 | const Vector2 GetPointGlobalPos(const Vector2& relativePoint) const 34 | { 35 | return pos + xVector * relativePoint.x + yVector * relativePoint.y; 36 | } 37 | 38 | const Vector2 GetAxisGlobalOrientation(const Vector2& relativeAxis) const 39 | { 40 | return xVector * relativeAxis.x + yVector * relativeAxis.y; 41 | } 42 | 43 | const Coords2 GetGlobalCoords(const Coords2& localCoords) 44 | { 45 | Coords2 res; 46 | res.pos = GetPointGlobalPos(localCoords.pos); 47 | res.xVector = GetAxisGlobalOrientation(localCoords.xVector); 48 | res.yVector = GetAxisGlobalOrientation(localCoords.yVector); 49 | return res; 50 | } 51 | 52 | const Coords2 GetLocalCoords(const Coords2& globalCoords) 53 | { 54 | Coords2 res; 55 | res.pos = GetPointRelativePos(globalCoords.pos); 56 | res.xVector = GetAxisRelativeOrientation(globalCoords.xVector); 57 | res.yVector = GetAxisRelativeOrientation(globalCoords.yVector); 58 | return res; 59 | } 60 | 61 | void Identity() 62 | { 63 | xVector = Vector2(1.0f, 0.0f); 64 | yVector = Vector2(0.0f, 1.0f); 65 | pos = Vector2::zeroVector(); 66 | } 67 | 68 | void SetRotation(const T& angle) 69 | { 70 | float pi = 3.141592f; 71 | xVector = Vector2(cos(angle), sin(angle)); 72 | yVector = Vector2(cos(angle + pi / 2.0), sin(angle + pi / 2.0)); 73 | } 74 | 75 | void Rotate(const T& angle) 76 | { 77 | this->xVector.Rotate(angle); 78 | this->yVector.Rotate(angle); 79 | } 80 | 81 | static const Coords2 defCoords() 82 | { 83 | Coords2 coords; 84 | coords.pos = Vector2::zeroVector(); 85 | coords.xVector = Vector2::xAxis(); 86 | coords.yVector = Vector2::yAxis(); 87 | return coords; 88 | } 89 | }; 90 | 91 | typedef Coords2 Coords2f; 92 | typedef Coords2 Coords2d; 93 | 94 | typedef Coords2f Coords2f; 95 | typedef Coords2d Coords2d; 96 | 
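// Illustrative note (not part of the original file): a Coords2 is a local frame - an origin plus a
// rotated basis (xVector/yVector). GetPointGlobalPos maps local coordinates into world space and
// GetPointRelativePos is its inverse; e.g. for Coords2f(Vector2f(10.0f, 0.0f), 3.141592f * 0.5f),
// GetPointGlobalPos(Vector2f(1.0f, 0.0f)) yields approximately (10, 1).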
-------------------------------------------------------------------------------- /src/Geom.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Vector2.h" 3 | #include "Coords2.h" 4 | #include "AABB2.h" 5 | #include 6 | 7 | struct RigidBody; 8 | struct Geom 9 | { 10 | Vector2f GetClippingVertex(const Vector2f& axis) const 11 | { 12 | Vector2f xdim = coords.xVector * size.x; 13 | Vector2f ydim = coords.yVector * size.y; 14 | 15 | float xsgn = coords.xVector * axis < 0.0f ? -1.0f : 1.0f; 16 | float ysgn = coords.yVector * axis < 0.0f ? -1.0f : 1.0f; 17 | 18 | return coords.pos + xsgn * xdim + ysgn * ydim; 19 | } 20 | 21 | bool GetClippingEdge(const Vector2f& axis, Vector2f& edgepoint1, Vector2f& edgepoint2) const 22 | { 23 | edgepoint1 = coords.pos; 24 | edgepoint2 = coords.pos; 25 | Vector2f xdim = coords.xVector * size.x; 26 | Vector2f ydim = coords.yVector * size.y; 27 | Vector2f offset = Vector2f::zero(); 28 | float xdiff = axis * coords.xVector; 29 | float ydiff = axis * coords.yVector; 30 | 31 | if (fabsf(xdiff) < fabsf(ydiff)) 32 | { 33 | if (axis * ydim > 0.0f) 34 | { 35 | offset += ydim; 36 | edgepoint1 += xdim; 37 | edgepoint2 -= xdim; 38 | } 39 | else 40 | { 41 | offset -= ydim; 42 | edgepoint1 -= xdim; 43 | edgepoint2 += xdim; 44 | } 45 | } 46 | else 47 | { 48 | if (axis * xdim > 0.0f) 49 | { 50 | offset += xdim; 51 | edgepoint1 -= ydim; 52 | edgepoint2 += ydim; 53 | } 54 | else 55 | { 56 | offset -= xdim; 57 | edgepoint1 += ydim; 58 | edgepoint2 -= ydim; 59 | } 60 | } 61 | edgepoint1 += offset; 62 | edgepoint2 += offset; 63 | return 1; 64 | } 65 | 66 | int GetSupportPointSet(const Vector2f& axis, Vector2f* supportPoints) 67 | { 68 | if ((fabsf(axis * coords.xVector) < 0.1f) || 69 | (fabsf(axis * coords.yVector) < 0.1f)) 70 | { 71 | GetClippingEdge(axis, supportPoints[0], supportPoints[1]); 72 | return 2; 73 | } 74 | 75 | supportPoints[0] = GetClippingVertex(axis); 76 | return 1; 77 | } 78 | 79 | void RecomputeAABB() 80 | { 81 | Vector2f diff = Vector2f( 82 | fabsf(coords.xVector.x) * size.x + fabsf(coords.yVector.x) * size.y, 83 | fabsf(coords.xVector.y) * size.x + fabsf(coords.yVector.y) * size.y); 84 | aabb.Set(coords.pos - diff, coords.pos + diff); 85 | } 86 | 87 | Vector2f size; 88 | Coords2f coords; 89 | AABB2f aabb; 90 | }; -------------------------------------------------------------------------------- /src/Joints.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "RigidBody.h" 4 | #include "Manifold.h" 5 | 6 | struct ContactJoint 7 | { 8 | ContactJoint(int body1Index, int body2Index, int collisionIndex) 9 | { 10 | this->contactPointIndex = collisionIndex; 11 | this->body1Index = body1Index; 12 | this->body2Index = body2Index; 13 | 14 | normalLimiter_accumulatedImpulse = 0.f; 15 | frictionLimiter_accumulatedImpulse = 0.f; 16 | } 17 | 18 | int contactPointIndex; 19 | int body1Index; 20 | int body2Index; 21 | float normalLimiter_accumulatedImpulse; 22 | float frictionLimiter_accumulatedImpulse; 23 | }; -------------------------------------------------------------------------------- /src/Manifold.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Vector2.h" 4 | #include "Geom.h" 5 | #include "RigidBody.h" 6 | 7 | #include 8 | #include 9 | 10 | static const int kMaxContactPoints = 2; 11 | 12 | struct ContactPoint 13 | { 14 | ContactPoint() 15 | { 16 | } 17 | 18 | 
ContactPoint(Vector2f point1, const Vector2f& point2, const Vector2f normal, RigidBody* body1, RigidBody* body2) 19 | { 20 | this->delta1 = point1 - body1->coords.pos; 21 | this->delta2 = point2 - body2->coords.pos; 22 | this->normal = normal; 23 | isMerged = 0; 24 | isNewlyCreated = 1; 25 | solverIndex = -1; 26 | } 27 | 28 | bool Equals(const ContactPoint& other, float tolerance) const 29 | { 30 | if (((other.delta1 - delta1).SquareLen() > tolerance * tolerance) && 31 | ((other.delta2 - delta2).SquareLen() > tolerance * tolerance)) 32 | { 33 | return 0; 34 | } 35 | return 1; 36 | } 37 | 38 | Vector2f delta1, delta2; 39 | Vector2f normal; 40 | bool isMerged; 41 | bool isNewlyCreated; 42 | int solverIndex; 43 | }; 44 | 45 | struct Manifold 46 | { 47 | Manifold() 48 | { 49 | body1Index = -1; 50 | body2Index = -1; 51 | pointCount = 0; 52 | pointIndex = 0; 53 | } 54 | 55 | Manifold(int body1Index, int body2Index, int pointIndex) 56 | { 57 | this->body1Index = body1Index; 58 | this->body2Index = body2Index; 59 | this->pointCount = 0; 60 | this->pointIndex = pointIndex; 61 | } 62 | 63 | int body1Index; 64 | int body2Index; 65 | 66 | int pointCount; 67 | int pointIndex; 68 | }; -------------------------------------------------------------------------------- /src/RigidBody.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Vector2.h" 3 | #include "Coords2.h" 4 | #include "Geom.h" 5 | 6 | #ifdef _MSC_VER 7 | #define NOINLINE __declspec(noinline) 8 | #else 9 | #define NOINLINE __attribute__((noinline)) 10 | #endif 11 | 12 | struct RigidBody 13 | { 14 | RigidBody() {} 15 | RigidBody(Coords2f coords, Vector2f size, float density) 16 | { 17 | this->coords = coords; 18 | displacingVelocity = Vector2f(0.0f, 0.0f); 19 | displacingAngularVelocity = 0.0f; 20 | 21 | acceleration = Vector2f(0.0f, 0.0f); 22 | angularAcceleration = 0.0f; 23 | 24 | velocity = Vector2f(0.0f, 0.0f); 25 | angularVelocity = 0.0f; 26 | 27 | geom.size = size; 28 | 29 | float mass = density * (size.x * size.y); 30 | float inertia = mass * (size.x * size.x + size.y * size.y); 31 | 32 | invMass = 1.0f / mass; 33 | invInertia = 1.0f / inertia; 34 | 35 | UpdateGeom(); 36 | } 37 | 38 | void UpdateGeom() 39 | { 40 | geom.coords = coords; 41 | geom.RecomputeAABB(); 42 | } 43 | 44 | unsigned int index; 45 | 46 | Geom geom; 47 | 48 | Vector2f velocity, acceleration; 49 | Vector2f displacingVelocity; 50 | float angularVelocity, angularAcceleration; 51 | float displacingAngularVelocity; 52 | 53 | float invMass, invInertia; 54 | Coords2f coords; 55 | 56 | int lastIteration; 57 | int lastDisplacementIteration; 58 | }; -------------------------------------------------------------------------------- /src/Solver.cpp: -------------------------------------------------------------------------------- 1 | #include "Solver.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "base/SIMD.h" 5 | 6 | #include "Configuration.h" 7 | 8 | const float kProductiveImpulse = 1e-4f; 9 | const float kFrictionCoefficient = 0.3f; 10 | 11 | const int kIslandMinSize = 256; 12 | 13 | Solver::Solver() 14 | { 15 | } 16 | 17 | NOINLINE void Solver::SolveJoints(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 18 | { 19 | switch (configuration.solveMode) 20 | { 21 | case Configuration::Solve_AVX2: 22 | #ifdef __AVX2__ 23 | SolveJoints_AVX2(queue, bodies, bodiesCount, contactPoints, configuration); 24 | break; 25 | #endif 26 | 27 | case 
Configuration::Solve_SSE2: 28 | #ifdef __SSE2__ 29 | SolveJoints_SSE2(queue, bodies, bodiesCount, contactPoints, configuration); 30 | break; 31 | #endif 32 | 33 | case Configuration::Solve_Scalar: 34 | SolveJoints_Scalar(queue, bodies, bodiesCount, contactPoints, configuration); 35 | break; 36 | 37 | default: 38 | assert(!"Unknown solver mode"); 39 | break; 40 | } 41 | } 42 | 43 | NOINLINE void Solver::SolveJoints_Scalar(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 44 | { 45 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_Scalar", -1); 46 | 47 | SolveJoints(queue, joint_packed1, bodies, bodiesCount, contactPoints, configuration); 48 | } 49 | 50 | #ifdef __SSE2__ 51 | NOINLINE void Solver::SolveJoints_SSE2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 52 | { 53 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_SSE2", -1); 54 | 55 | SolveJoints(queue, joint_packed4, bodies, bodiesCount, contactPoints, configuration); 56 | } 57 | #endif 58 | 59 | #ifdef __AVX2__ 60 | NOINLINE void Solver::SolveJoints_AVX2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 61 | { 62 | MICROPROFILE_SCOPEI("Physics", "SolveJoints_AVX2", -1); 63 | 64 | SolveJoints(queue, joint_packed8, bodies, bodiesCount, contactPoints, configuration); 65 | } 66 | #endif 67 | 68 | template 69 | void Solver::SolveJoints(WorkQueue& queue, AlignedArray>& joint_packed, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration) 70 | { 71 | PrepareBodies(bodies, bodiesCount); 72 | 73 | bool splitIslands = (configuration.islandMode == Configuration::Island_Multiple || configuration.islandMode == Configuration::Island_MultipleSloppy); 74 | 75 | if (splitIslands) 76 | { 77 | int jointCountAligned = GatherIslands(bodies, bodiesCount, N); 78 | 79 | joint_packed.resize(jointCountAligned); 80 | jointGroup_joints.resize(jointCountAligned); 81 | jointGroup_bodies.resize(bodiesCount); 82 | 83 | for (int i = 0; i < bodiesCount; ++i) 84 | jointGroup_bodies[i] = 0; 85 | 86 | parallelFor(queue, 0, islandCount, 1, [&](int islandIndex, int) { 87 | int jointsBegin = island_offset[islandIndex]; 88 | int jointsEnd = jointsBegin + island_size[islandIndex]; 89 | 90 | SolveJointIsland(queue, joint_packed, jointsBegin, jointsEnd, contactPoints, configuration); 91 | }); 92 | } 93 | else 94 | { 95 | int jointCount = contactJoints.size; 96 | 97 | joint_index.resize(jointCount); 98 | joint_packed.resize(jointCount); 99 | jointGroup_joints.resize(jointCount); 100 | jointGroup_bodies.resize(bodiesCount); 101 | 102 | for (int i = 0; i < jointCount; ++i) 103 | joint_index[i] = i; 104 | 105 | for (int i = 0; i < bodiesCount; ++i) 106 | jointGroup_bodies[i] = 0; 107 | 108 | islandCount = 1; 109 | islandMaxSize = jointCount; 110 | 111 | SolveJointIsland(queue, joint_packed, 0, jointCount, contactPoints, configuration); 112 | } 113 | 114 | FinishBodies(bodies, bodiesCount); 115 | 116 | MICROPROFILE_COUNTER_SET("physics/islands", islandCount); 117 | MICROPROFILE_COUNTER_SET("physics/bodies", bodiesCount); 118 | MICROPROFILE_COUNTER_SET("physics/joints", contactJoints.size); 119 | } 120 | 121 | static bool any(const AlignedArray& v) 122 | { 123 | for (int i = 0; i < v.size; ++i) 124 | if (v[i]) 125 | return true; 126 | 127 | return false; 128 | } 129 | 130 | template 131 | NOINLINE void Solver::SolveJointIsland(WorkQueue& 
queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints, const Configuration& configuration) 132 | { 133 | MICROPROFILE_SCOPEI("Physics", "SolveJointIsland", -1); 134 | 135 | int groupOffset = PrepareJoints(queue, joint_packed, jointBegin, jointEnd, N); 136 | 137 | bool sloppy = (configuration.islandMode == Configuration::Island_SingleSloppy || configuration.islandMode == Configuration::Island_MultipleSloppy); 138 | int batchSize = sloppy ? 512 : jointEnd - jointBegin; 139 | int batchCount = ((jointEnd - jointBegin) + batchSize - 1) / batchSize; 140 | 141 | { 142 | MICROPROFILE_SCOPEI("Physics", "Prepare", -1); 143 | 144 | { 145 | MICROPROFILE_SCOPEI("Physics", "RefreshJoints", -1); 146 | 147 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 148 | int batchBegin = jointBegin + batchIndex * batchSize; 149 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 150 | 151 | RefreshJoints(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), contactPoints); 152 | RefreshJoints<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, contactPoints); 153 | }); 154 | } 155 | 156 | { 157 | MICROPROFILE_SCOPEI("Physics", "PreStepJoints", -1); 158 | 159 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 160 | int batchBegin = jointBegin + batchIndex * batchSize; 161 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 162 | 163 | PreStepJoints(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd)); 164 | PreStepJoints<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd); 165 | }); 166 | } 167 | } 168 | 169 | AlignedArray productivew; 170 | productivew.resize(queue.getWorkerCount() + 1); 171 | 172 | { 173 | MICROPROFILE_SCOPEI("Physics", "Impulse", -1); 174 | 175 | for (int iterationIndex = 0; iterationIndex < configuration.contactIterationsCount; iterationIndex++) 176 | { 177 | MICROPROFILE_SCOPEI("Physics", "ImpulseIteration", -1); 178 | 179 | memset(productivew.data, 0, productivew.size * sizeof(bool)); 180 | 181 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 182 | int batchBegin = jointBegin + batchIndex * batchSize; 183 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 184 | 185 | productivew[worker] |= SolveJointsImpulses(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), iterationIndex); 186 | productivew[worker] |= SolveJointsImpulses<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, iterationIndex); 187 | }); 188 | 189 | if (!any(productivew)) break; 190 | } 191 | } 192 | 193 | { 194 | MICROPROFILE_SCOPEI("Physics", "Displacement", -1); 195 | 196 | for (int iterationIndex = 0; iterationIndex < configuration.penetrationIterationsCount; iterationIndex++) 197 | { 198 | MICROPROFILE_SCOPEI("Physics", "DisplacementIteration", -1); 199 | 200 | memset(productivew.data, 0, productivew.size * sizeof(bool)); 201 | 202 | parallelFor(queue, 0, batchCount, 1, [&](int batchIndex, int worker) { 203 | int batchBegin = jointBegin + batchIndex * batchSize; 204 | int batchEnd = std::min(batchBegin + batchSize, jointEnd); 205 | 206 | productivew[worker] |= SolveJointsDisplacement(joint_packed.data, batchBegin, std::min(groupOffset, batchEnd), iterationIndex); 207 | productivew[worker] |= SolveJointsDisplacement<1>(joint_packed.data, std::max(groupOffset, batchBegin), batchEnd, iterationIndex); 208 | }); 209 | 210 | if (!any(productivew)) break; 211 | } 212 | } 213 | 214 | FinishJoints(queue, joint_packed, 
jointBegin, jointEnd); 215 | } 216 | 217 | NOINLINE int Solver::PrepareIndices(int jointBegin, int jointEnd, int groupSizeTarget) 218 | { 219 | MICROPROFILE_SCOPEI("Physics", "PrepareIndices", -1); 220 | 221 | if (groupSizeTarget == 1) 222 | return jointEnd; 223 | 224 | for (int i = jointBegin; i < jointEnd; ++i) 225 | jointGroup_joints[i] = joint_index[i]; 226 | 227 | int tag = 0; 228 | 229 | int remainingJoints = jointEnd - jointBegin; 230 | int groupOffset = jointBegin; 231 | 232 | while (remainingJoints >= groupSizeTarget) 233 | { 234 | // gather a group of N joints with non-overlapping bodies 235 | int groupSize = 0; 236 | 237 | tag++; 238 | 239 | for (int i = 0; i < remainingJoints && groupSize < groupSizeTarget;) 240 | { 241 | int jointIndex = jointGroup_joints[jointBegin + i]; 242 | ContactJoint& joint = contactJoints[jointIndex]; 243 | 244 | // TODO: race between static bodies from different islands 245 | if (jointGroup_bodies[joint.body1Index] < tag && jointGroup_bodies[joint.body2Index] < tag) 246 | { 247 | jointGroup_bodies[joint.body1Index] = tag; 248 | jointGroup_bodies[joint.body2Index] = tag; 249 | 250 | joint_index[groupOffset + groupSize] = jointIndex; 251 | groupSize++; 252 | 253 | jointGroup_joints[jointBegin + i] = jointGroup_joints[jointBegin + remainingJoints - 1]; 254 | remainingJoints--; 255 | } 256 | else 257 | { 258 | i++; 259 | } 260 | } 261 | 262 | groupOffset += groupSize; 263 | 264 | if (groupSize < groupSizeTarget) 265 | break; 266 | } 267 | 268 | // fill in the rest of the joints sequentially - they don't form a group so we'll have to solve them 1 by 1 269 | for (int i = 0; i < remainingJoints; ++i) 270 | joint_index[groupOffset + i] = jointGroup_joints[jointBegin + i]; 271 | 272 | return groupOffset & ~(groupSizeTarget - 1); 273 | } 274 | 275 | static int remap(AlignedArray& table, int index) 276 | { 277 | int result = index; 278 | 279 | while (result != table[result]) 280 | result = table[result]; 281 | 282 | return table[index] = result; 283 | } 284 | 285 | NOINLINE int Solver::GatherIslands(RigidBody* bodies, int bodiesCount, int groupSizeTarget) 286 | { 287 | MICROPROFILE_SCOPEI("Physics", "GatherIslands", -1); 288 | 289 | int jointCount = contactJoints.size; 290 | int jointCountAligned = jointCount; 291 | 292 | island_remap.resize(bodiesCount); 293 | island_index.resize(bodiesCount); 294 | island_indexremap.resize(bodiesCount); 295 | island_offset.resize(bodiesCount); 296 | island_offsettemp.resize(bodiesCount); 297 | island_size.resize(bodiesCount); 298 | 299 | { 300 | MICROPROFILE_SCOPEI("Physics", "Prepare", -1); 301 | 302 | for (int i = 0; i < bodiesCount; ++i) 303 | { 304 | island_remap[i] = (bodies[i].invMass == 0 && bodies[i].invInertia == 0) ? 
-1 : i; 305 | } 306 | } 307 | 308 | { 309 | MICROPROFILE_SCOPEI("Physics", "Merge", -1); 310 | 311 | for (ContactJoint& j: contactJoints) 312 | { 313 | int island1 = island_remap[j.body1Index]; 314 | int island2 = island_remap[j.body2Index]; 315 | 316 | if ((island1 | island2) < 0) 317 | continue; 318 | 319 | int remap1 = remap(island_remap, island1); 320 | int remap2 = remap(island_remap, island2); 321 | 322 | island_remap[remap1] = remap2; 323 | } 324 | } 325 | 326 | { 327 | MICROPROFILE_SCOPEI("Physics", "Gather", -1); 328 | 329 | islandCount = 0; 330 | 331 | for (int i = 0; i < bodiesCount; ++i) 332 | { 333 | island_index[i] = -1; 334 | } 335 | 336 | for (int i = 0; i < bodiesCount; ++i) 337 | { 338 | if (island_remap[i] < 0) 339 | continue; 340 | 341 | island_remap[i] = remap(island_remap, i); 342 | } 343 | 344 | for (int i = 0; i < bodiesCount; ++i) 345 | { 346 | if (island_remap[i] < 0) 347 | continue; 348 | 349 | int island = island_remap[i]; 350 | 351 | if (island_index[island] < 0) 352 | { 353 | island_index[island] = islandCount; 354 | islandCount++; 355 | } 356 | } 357 | } 358 | 359 | { 360 | MICROPROFILE_SCOPEI("Physics", "Count", -1); 361 | 362 | for (int i = 0; i < islandCount; ++i) 363 | { 364 | island_offset[i] = 0; 365 | } 366 | 367 | for (ContactJoint& j: contactJoints) 368 | { 369 | int island1 = island_remap[j.body1Index]; 370 | int island2 = island_remap[j.body2Index]; 371 | 372 | if ((island1 & island2) < 0) 373 | continue; 374 | 375 | assert(island1 == island2 || ((island1 | island2) < 0 && (island1 & island2) >= 0)); 376 | int island = island1 < 0 ? island2 : island1; 377 | 378 | island_offset[island_index[island]]++; 379 | } 380 | } 381 | 382 | { 383 | MICROPROFILE_SCOPEI("Physics", "Coalesce", -1); 384 | 385 | for (int i = 0; i < islandCount; ++i) 386 | { 387 | island_indexremap[i] = i; 388 | } 389 | 390 | int runningIndex = 0; 391 | int runningCount = 0; 392 | int totalCount = 0; 393 | 394 | for (int i = 0; i < islandCount; ++i) 395 | { 396 | runningCount += island_offset[i]; 397 | 398 | island_indexremap[i] = runningIndex; 399 | 400 | if (runningCount >= kIslandMinSize || (runningCount > 0 && i == islandCount - 1)) 401 | { 402 | int runningCountAligned = (runningCount + groupSizeTarget - 1) & ~(groupSizeTarget - 1); 403 | 404 | island_size[runningIndex] = runningCount; 405 | island_offset[runningIndex] = totalCount; 406 | 407 | totalCount += runningCountAligned; 408 | runningCount = 0; 409 | runningIndex++; 410 | } 411 | } 412 | 413 | jointCountAligned = totalCount; 414 | islandCount = runningIndex; 415 | } 416 | 417 | joint_index.resize(jointCountAligned); 418 | 419 | { 420 | MICROPROFILE_SCOPEI("Physics", "Index", -1); 421 | 422 | for (int i = 0; i < islandCount; ++i) 423 | { 424 | island_offsettemp[i] = island_offset[i]; 425 | } 426 | 427 | for (int jointIndex = 0; jointIndex < jointCount; ++jointIndex) 428 | { 429 | ContactJoint& j = contactJoints[jointIndex]; 430 | 431 | int island1 = island_remap[j.body1Index]; 432 | int island2 = island_remap[j.body2Index]; 433 | 434 | if ((island1 & island2) < 0) 435 | continue; 436 | 437 | assert(island1 == island2 || ((island1 | island2) < 0 && (island1 & island2) >= 0)); 438 | int island = island1 < 0 ? 
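// Joints between two static bodies were skipped by the (island1 & island2) < 0 test above
// (the bitwise AND is negative only when both indices are -1); for a static-dynamic pair,
// take whichever side carries a valid island index.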
island2 : island1; 439 | 440 | joint_index[island_offsettemp[island_indexremap[island_index[island]]]++] = jointIndex; 441 | } 442 | 443 | islandMaxSize = 0; 444 | 445 | for (int i = 0; i < islandCount; ++i) 446 | { 447 | assert(island_offsettemp[i] == island_offset[i] + island_size[i]); 448 | 449 | islandMaxSize = std::max(islandMaxSize, island_size[i]); 450 | } 451 | } 452 | 453 | return jointCountAligned; 454 | } 455 | 456 | NOINLINE void Solver::PrepareBodies(RigidBody* bodies, int bodiesCount) 457 | { 458 | MICROPROFILE_SCOPEI("Physics", "PrepareBodies", -1); 459 | 460 | solveBodiesParams.resize(bodiesCount); 461 | solveBodiesImpulse.resize(bodiesCount); 462 | solveBodiesDisplacement.resize(bodiesCount); 463 | 464 | for (int i = 0; i < bodiesCount; ++i) 465 | { 466 | solveBodiesParams[i].invMass = bodies[i].invMass; 467 | solveBodiesParams[i].invInertia = bodies[i].invInertia; 468 | solveBodiesParams[i].coords_pos = bodies[i].coords.pos; 469 | solveBodiesParams[i].coords_xVector = bodies[i].coords.xVector; 470 | solveBodiesParams[i].coords_yVector = bodies[i].coords.yVector; 471 | 472 | solveBodiesImpulse[i].velocity = bodies[i].velocity; 473 | solveBodiesImpulse[i].angularVelocity = bodies[i].angularVelocity; 474 | solveBodiesImpulse[i].lastIteration = -1; 475 | 476 | solveBodiesDisplacement[i].velocity = bodies[i].displacingVelocity; 477 | solveBodiesDisplacement[i].angularVelocity = bodies[i].displacingAngularVelocity; 478 | solveBodiesDisplacement[i].lastIteration = -1; 479 | } 480 | } 481 | 482 | NOINLINE void Solver::FinishBodies(RigidBody* bodies, int bodiesCount) 483 | { 484 | MICROPROFILE_SCOPEI("Physics", "FinishBodies", -1); 485 | 486 | for (int i = 0; i < bodiesCount; ++i) 487 | { 488 | bodies[i].velocity = solveBodiesImpulse[i].velocity; 489 | bodies[i].angularVelocity = solveBodiesImpulse[i].angularVelocity; 490 | 491 | bodies[i].displacingVelocity = solveBodiesDisplacement[i].velocity; 492 | bodies[i].displacingAngularVelocity = solveBodiesDisplacement[i].angularVelocity; 493 | } 494 | } 495 | 496 | template 497 | NOINLINE int Solver::PrepareJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, int groupSizeTarget) 498 | { 499 | MICROPROFILE_SCOPEI("Physics", "PrepareJoints", -1); 500 | 501 | assert(jointBegin % groupSizeTarget == 0); 502 | assert(jointBegin % N == 0); 503 | 504 | int groupOffset = PrepareIndices(jointBegin, jointEnd, groupSizeTarget); 505 | 506 | { 507 | MICROPROFILE_SCOPEI("Physics", "CopyJoints", -1); 508 | 509 | parallelFor(queue, 0, jointEnd - jointBegin, 128, [&](int i, int) { 510 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 511 | 512 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 513 | int iP = i & (N - 1); 514 | 515 | jointP.body1Index[iP] = joint.body1Index; 516 | jointP.body2Index[iP] = joint.body2Index; 517 | jointP.contactPointIndex[iP] = joint.contactPointIndex; 518 | 519 | jointP.normalLimiter_accumulatedImpulse[iP] = joint.normalLimiter_accumulatedImpulse; 520 | jointP.frictionLimiter_accumulatedImpulse[iP] = joint.frictionLimiter_accumulatedImpulse; 521 | }); 522 | } 523 | 524 | return groupOffset; 525 | } 526 | 527 | template 528 | NOINLINE void Solver::FinishJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd) 529 | { 530 | MICROPROFILE_SCOPEI("Physics", "FinishJoints", -1); 531 | 532 | assert(jointBegin % N == 0); 533 | 534 | { 535 | MICROPROFILE_SCOPEI("Physics", "CopyJoints", -1); 536 | 537 | parallelFor(queue, 0, jointEnd 
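// Only the two accumulated impulses need to travel back into the ContactJoint array -
// every other packed field is rebuilt from the contact points by RefreshJoints on the
// next solve.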
- jointBegin, 128, [&](int i, int) { 538 | ContactJoint& joint = contactJoints[joint_index[jointBegin + i]]; 539 | 540 | ContactJointPacked& jointP = joint_packed[unsigned(jointBegin + i) / N]; 541 | int iP = i & (N - 1); 542 | 543 | joint.normalLimiter_accumulatedImpulse = jointP.normalLimiter_accumulatedImpulse[iP]; 544 | joint.frictionLimiter_accumulatedImpulse = jointP.frictionLimiter_accumulatedImpulse[iP]; 545 | }); 546 | } 547 | } 548 | 549 | template 550 | static void RefreshLimiter( 551 | ContactLimiterPacked& limiter, int iP, 552 | const Vf& n1X, const Vf& n1Y, const Vf& n2X, const Vf& n2Y, const Vf& w1X, const Vf& w1Y, const Vf& w2X, const Vf& w2Y, 553 | const Vf& body1_invMass, const Vf& body1_invInertia, const Vf& body2_invMass, const Vf& body2_invInertia) 554 | { 555 | Vf normalProjector1X = n1X; 556 | Vf normalProjector1Y = n1Y; 557 | Vf normalProjector2X = n2X; 558 | Vf normalProjector2Y = n2Y; 559 | Vf angularProjector1 = n1X * w1Y - n1Y * w1X; 560 | Vf angularProjector2 = n2X * w2Y - n2Y * w2X; 561 | 562 | Vf compMass1_linearX = normalProjector1X * body1_invMass; 563 | Vf compMass1_linearY = normalProjector1Y * body1_invMass; 564 | Vf compMass1_angular = angularProjector1 * body1_invInertia; 565 | Vf compMass2_linearX = normalProjector2X * body2_invMass; 566 | Vf compMass2_linearY = normalProjector2Y * body2_invMass; 567 | Vf compMass2_angular = angularProjector2 * body2_invInertia; 568 | 569 | Vf compMass1 = normalProjector1X * compMass1_linearX + normalProjector1Y * compMass1_linearY + angularProjector1 * compMass1_angular; 570 | Vf compMass2 = normalProjector2X * compMass2_linearX + normalProjector2Y * compMass2_linearY + angularProjector2 * compMass2_angular; 571 | 572 | Vf compMass = compMass1 + compMass2; 573 | 574 | Vf compInvMass = select(Vf::zero(), Vf::one(1) / compMass, abs(compMass) > Vf::zero()); 575 | 576 | store(normalProjector1X, &limiter.normalProjector1X[iP]); 577 | store(normalProjector1Y, &limiter.normalProjector1Y[iP]); 578 | store(normalProjector2X, &limiter.normalProjector2X[iP]); 579 | store(normalProjector2Y, &limiter.normalProjector2Y[iP]); 580 | store(angularProjector1, &limiter.angularProjector1[iP]); 581 | store(angularProjector2, &limiter.angularProjector2[iP]); 582 | 583 | store(compMass1_linearX, &limiter.compMass1_linearX[iP]); 584 | store(compMass1_linearY, &limiter.compMass1_linearY[iP]); 585 | store(compMass2_linearX, &limiter.compMass2_linearX[iP]); 586 | store(compMass2_linearY, &limiter.compMass2_linearY[iP]); 587 | store(compMass1_angular, &limiter.compMass1_angular[iP]); 588 | store(compMass2_angular, &limiter.compMass2_angular[iP]); 589 | store(compInvMass, &limiter.compInvMass[iP]); 590 | } 591 | 592 | template 593 | NOINLINE void Solver::RefreshJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints) 594 | { 595 | typedef simd::VNf Vf; 596 | 597 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 598 | 599 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 600 | { 601 | int i = jointIndex; 602 | 603 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 604 | int iP = (VN == N) ? 
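// Joints are packed N at a time in structure-of-arrays form: unsigned(i) / N selects the
// packed group and i & (N - 1) the lane within it. When the SIMD width VN equals the
// packing width N, the whole group is consumed in one pass and the lane offset is simply 0.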
0 : i & (N - 1); 605 | 606 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 607 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 608 | 609 | Vf body1_invMass, body1_invInertia, body1_coords_posX, body1_coords_posY; 610 | Vf body1_coords_xVectorX, body1_coords_xVectorY, body1_coords_yVectorX, body1_coords_yVectorY; 611 | 612 | Vf body2_invMass, body2_invInertia, body2_coords_posX, body2_coords_posY; 613 | Vf body2_coords_xVectorX, body2_coords_xVectorY, body2_coords_yVectorX, body2_coords_yVectorY; 614 | 615 | Vf collision_delta1X, collision_delta1Y, collision_delta2X, collision_delta2Y; 616 | Vf collision_normalX, collision_normalY; 617 | Vf dummy; 618 | 619 | loadindexed4( 620 | body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 621 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 622 | 623 | loadindexed4( 624 | body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 625 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 626 | 627 | loadindexed8( 628 | body1_invMass, body1_invInertia, body1_coords_posX, body1_coords_posY, 629 | body1_coords_xVectorX, body1_coords_xVectorY, body1_coords_yVectorX, body1_coords_yVectorY, 630 | solveBodiesParams.data, jointP.body1Index + iP, sizeof(SolveBodyParams)); 631 | 632 | loadindexed8( 633 | body2_invMass, body2_invInertia, body2_coords_posX, body2_coords_posY, 634 | body2_coords_xVectorX, body2_coords_xVectorY, body2_coords_yVectorX, body2_coords_yVectorY, 635 | solveBodiesParams.data, jointP.body2Index + iP, sizeof(SolveBodyParams)); 636 | 637 | loadindexed8( 638 | collision_delta1X, collision_delta1Y, collision_delta2X, collision_delta2Y, 639 | collision_normalX, collision_normalY, dummy, dummy, 640 | contactPoints, jointP.contactPointIndex + iP, sizeof(ContactPoint)); 641 | 642 | Vf point1X = collision_delta1X + body1_coords_posX; 643 | Vf point1Y = collision_delta1Y + body1_coords_posY; 644 | Vf point2X = collision_delta2X + body2_coords_posX; 645 | Vf point2Y = collision_delta2Y + body2_coords_posY; 646 | 647 | Vf w1X = collision_delta1X; 648 | Vf w1Y = collision_delta1Y; 649 | Vf w2X = point1X - body2_coords_posX; 650 | Vf w2Y = point1Y - body2_coords_posY; 651 | 652 | // Normal limiter 653 | RefreshLimiter(jointP.normalLimiter, iP, 654 | collision_normalX, collision_normalY, -collision_normalX, -collision_normalY, 655 | w1X, w1Y, w2X, w2Y, 656 | body1_invMass, body1_invInertia, body2_invMass, body2_invInertia); 657 | 658 | Vf bounce = Vf::zero(); 659 | Vf deltaVelocity = Vf::one(1.f); 660 | Vf maxPenetrationVelocity = Vf::one(0.1f); 661 | Vf deltaDepth = Vf::one(1.f); 662 | Vf errorReduction = Vf::one(0.1f); 663 | 664 | Vf pointVelocity_body1X = (body1_coords_posY - point1Y) * body1_angularVelocity + body1_velocityX; 665 | Vf pointVelocity_body1Y = (point1X - body1_coords_posX) * body1_angularVelocity + body1_velocityY; 666 | 667 | Vf pointVelocity_body2X = (body2_coords_posY - point2Y) * body2_angularVelocity + body2_velocityX; 668 | Vf pointVelocity_body2Y = (point2X - body2_coords_posX) * body2_angularVelocity + body2_velocityY; 669 | 670 | Vf relativeVelocityX = pointVelocity_body1X - pointVelocity_body2X; 671 | Vf relativeVelocityY = pointVelocity_body1Y - pointVelocity_body2Y; 672 | 673 | Vf dv = -bounce * (relativeVelocityX * collision_normalX + relativeVelocityY * collision_normalY); 674 | Vf depth = (point2X - point1X) * collision_normalX + (point2Y - point1Y) * 
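// depth is the current penetration measured along the contact normal. With bounce fixed at
// zero, the velocity-level target below is merely relaxed by maxPenetrationVelocity while
// penetration is shallow (depth < deltaDepth), and the displacement pass is only asked to
// remove the part of the penetration that exceeds 2 * deltaDepth, scaled by errorReduction -
// presumably to keep resting stacks from jittering.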
collision_normalY; 675 | 676 | Vf dstVelocity = max(dv - deltaVelocity, Vf::zero()); 677 | 678 | Vf j_normalLimiter_dstVelocity = select(dstVelocity, dstVelocity - maxPenetrationVelocity, depth < deltaDepth); 679 | Vf j_normalLimiter_dstDisplacingVelocity = errorReduction * max(Vf::zero(), depth - Vf::one(2.0f) * deltaDepth); 680 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::zero(); 681 | 682 | // Friction limiter 683 | Vf tangentX = -collision_normalY; 684 | Vf tangentY = collision_normalX; 685 | 686 | RefreshLimiter(jointP.frictionLimiter, iP, 687 | tangentX, tangentY, -tangentX, -tangentY, 688 | w1X, w1Y, w2X, w2Y, 689 | body1_invMass, body1_invInertia, body2_invMass, body2_invInertia); 690 | 691 | store(j_normalLimiter_dstVelocity, &jointP.normalLimiter_dstVelocity[iP]); 692 | store(j_normalLimiter_dstDisplacingVelocity, &jointP.normalLimiter_dstDisplacingVelocity[iP]); 693 | store(j_normalLimiter_accumulatedDisplacingImpulse, &jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 694 | } 695 | } 696 | 697 | template 698 | NOINLINE void Solver::PreStepJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd) 699 | { 700 | typedef simd::VNf Vf; 701 | 702 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 703 | 704 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 705 | { 706 | int i = jointIndex; 707 | 708 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 709 | int iP = (VN == N) ? 0 : i & (N - 1); 710 | 711 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 712 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 713 | 714 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 715 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 716 | 717 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 718 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 719 | 720 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 721 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 722 | Vf j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 723 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 724 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 725 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 726 | Vf j_normalLimiter_accumulatedImpulse = Vf::load(&jointP.normalLimiter_accumulatedImpulse[iP]); 727 | 728 | Vf j_frictionLimiter_compMass1_linearX = Vf::load(&jointP.frictionLimiter.compMass1_linearX[iP]); 729 | Vf j_frictionLimiter_compMass1_linearY = Vf::load(&jointP.frictionLimiter.compMass1_linearY[iP]); 730 | Vf j_frictionLimiter_compMass2_linearX = Vf::load(&jointP.frictionLimiter.compMass2_linearX[iP]); 731 | Vf j_frictionLimiter_compMass2_linearY = Vf::load(&jointP.frictionLimiter.compMass2_linearY[iP]); 732 | Vf j_frictionLimiter_compMass1_angular = Vf::load(&jointP.frictionLimiter.compMass1_angular[iP]); 733 | Vf j_frictionLimiter_compMass2_angular = Vf::load(&jointP.frictionLimiter.compMass2_angular[iP]); 734 | Vf j_frictionLimiter_accumulatedImpulse = Vf::load(&jointP.frictionLimiter_accumulatedImpulse[iP]); 735 | 736 | body1_velocityX += j_normalLimiter_compMass1_linearX * 
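// Warm starting: the impulses accumulated on the previous frame are applied up front, so
// the iterative solve only has to find a correction on top of an already plausible solution.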
j_normalLimiter_accumulatedImpulse; 737 | body1_velocityY += j_normalLimiter_compMass1_linearY * j_normalLimiter_accumulatedImpulse; 738 | body1_angularVelocity += j_normalLimiter_compMass1_angular * j_normalLimiter_accumulatedImpulse; 739 | 740 | body2_velocityX += j_normalLimiter_compMass2_linearX * j_normalLimiter_accumulatedImpulse; 741 | body2_velocityY += j_normalLimiter_compMass2_linearY * j_normalLimiter_accumulatedImpulse; 742 | body2_angularVelocity += j_normalLimiter_compMass2_angular * j_normalLimiter_accumulatedImpulse; 743 | 744 | body1_velocityX += j_frictionLimiter_compMass1_linearX * j_frictionLimiter_accumulatedImpulse; 745 | body1_velocityY += j_frictionLimiter_compMass1_linearY * j_frictionLimiter_accumulatedImpulse; 746 | body1_angularVelocity += j_frictionLimiter_compMass1_angular * j_frictionLimiter_accumulatedImpulse; 747 | 748 | body2_velocityX += j_frictionLimiter_compMass2_linearX * j_frictionLimiter_accumulatedImpulse; 749 | body2_velocityY += j_frictionLimiter_compMass2_linearY * j_frictionLimiter_accumulatedImpulse; 750 | body2_angularVelocity += j_frictionLimiter_compMass2_angular * j_frictionLimiter_accumulatedImpulse; 751 | 752 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 753 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 754 | 755 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 756 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 757 | } 758 | } 759 | 760 | template 761 | NOINLINE bool Solver::SolveJointsImpulses(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex) 762 | { 763 | typedef simd::VNf Vf; 764 | typedef simd::VNi Vi; 765 | typedef simd::VNb Vb; 766 | 767 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 768 | 769 | Vi iterationIndex0 = Vi::one(iterationIndex); 770 | Vi iterationIndex2 = Vi::one(iterationIndex - 2); 771 | 772 | Vb productive_any = Vb::zero(); 773 | 774 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 775 | { 776 | int i = jointIndex; 777 | 778 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 779 | int iP = (VN == N) ? 
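// Each body stores the index of the last iteration in which one of its joints applied a
// meaningful impulse, bit-cast into the fourth float lane so it rides along with the
// velocity in a single gather. If neither body satisfies lastIteration > iterationIndex - 2,
// the whole joint group is skipped.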
0 : i & (N - 1); 780 | 781 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 782 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 783 | 784 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 785 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 786 | 787 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 788 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 789 | 790 | Vi body1_lastIteration = bitcast(body1_lastIterationf); 791 | Vi body2_lastIteration = bitcast(body2_lastIterationf); 792 | 793 | Vb body1_productive = body1_lastIteration > iterationIndex2; 794 | Vb body2_productive = body2_lastIteration > iterationIndex2; 795 | Vb body_productive = body1_productive | body2_productive; 796 | 797 | if (none(body_productive)) 798 | continue; 799 | 800 | Vf j_normalLimiter_normalProjector1X = Vf::load(&jointP.normalLimiter.normalProjector1X[iP]); 801 | Vf j_normalLimiter_normalProjector1Y = Vf::load(&jointP.normalLimiter.normalProjector1Y[iP]); 802 | Vf j_normalLimiter_normalProjector2X = Vf::load(&jointP.normalLimiter.normalProjector2X[iP]); 803 | Vf j_normalLimiter_normalProjector2Y = Vf::load(&jointP.normalLimiter.normalProjector2Y[iP]); 804 | Vf j_normalLimiter_angularProjector1 = Vf::load(&jointP.normalLimiter.angularProjector1[iP]); 805 | Vf j_normalLimiter_angularProjector2 = Vf::load(&jointP.normalLimiter.angularProjector2[iP]); 806 | 807 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 808 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 809 | Vf j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 810 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 811 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 812 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 813 | Vf j_normalLimiter_compInvMass = Vf::load(&jointP.normalLimiter.compInvMass[iP]); 814 | Vf j_normalLimiter_accumulatedImpulse = Vf::load(&jointP.normalLimiter_accumulatedImpulse[iP]); 815 | Vf j_normalLimiter_dstVelocity = Vf::load(&jointP.normalLimiter_dstVelocity[iP]); 816 | 817 | Vf j_frictionLimiter_normalProjector1X = Vf::load(&jointP.frictionLimiter.normalProjector1X[iP]); 818 | Vf j_frictionLimiter_normalProjector1Y = Vf::load(&jointP.frictionLimiter.normalProjector1Y[iP]); 819 | Vf j_frictionLimiter_normalProjector2X = Vf::load(&jointP.frictionLimiter.normalProjector2X[iP]); 820 | Vf j_frictionLimiter_normalProjector2Y = Vf::load(&jointP.frictionLimiter.normalProjector2Y[iP]); 821 | Vf j_frictionLimiter_angularProjector1 = Vf::load(&jointP.frictionLimiter.angularProjector1[iP]); 822 | Vf j_frictionLimiter_angularProjector2 = Vf::load(&jointP.frictionLimiter.angularProjector2[iP]); 823 | 824 | Vf j_frictionLimiter_compMass1_linearX = Vf::load(&jointP.frictionLimiter.compMass1_linearX[iP]); 825 | Vf j_frictionLimiter_compMass1_linearY = Vf::load(&jointP.frictionLimiter.compMass1_linearY[iP]); 826 | Vf j_frictionLimiter_compMass2_linearX = Vf::load(&jointP.frictionLimiter.compMass2_linearX[iP]); 827 | Vf j_frictionLimiter_compMass2_linearY = Vf::load(&jointP.frictionLimiter.compMass2_linearY[iP]); 828 | Vf j_frictionLimiter_compMass1_angular = 
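// Sequential impulses: the normal impulse is solved and clamped first, and its accumulated
// value then bounds the friction impulse below via the Coulomb cone
// |friction| <= kFrictionCoefficient * normal.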
Vf::load(&jointP.frictionLimiter.compMass1_angular[iP]); 829 | Vf j_frictionLimiter_compMass2_angular = Vf::load(&jointP.frictionLimiter.compMass2_angular[iP]); 830 | Vf j_frictionLimiter_compInvMass = Vf::load(&jointP.frictionLimiter.compInvMass[iP]); 831 | Vf j_frictionLimiter_accumulatedImpulse = Vf::load(&jointP.frictionLimiter_accumulatedImpulse[iP]); 832 | 833 | Vf normaldV = j_normalLimiter_dstVelocity; 834 | 835 | normaldV -= j_normalLimiter_normalProjector1X * body1_velocityX; 836 | normaldV -= j_normalLimiter_normalProjector1Y * body1_velocityY; 837 | normaldV -= j_normalLimiter_angularProjector1 * body1_angularVelocity; 838 | 839 | normaldV -= j_normalLimiter_normalProjector2X * body2_velocityX; 840 | normaldV -= j_normalLimiter_normalProjector2Y * body2_velocityY; 841 | normaldV -= j_normalLimiter_angularProjector2 * body2_angularVelocity; 842 | 843 | Vf normalDeltaImpulse = normaldV * j_normalLimiter_compInvMass; 844 | 845 | normalDeltaImpulse = max(normalDeltaImpulse, -j_normalLimiter_accumulatedImpulse); 846 | 847 | body1_velocityX += j_normalLimiter_compMass1_linearX * normalDeltaImpulse; 848 | body1_velocityY += j_normalLimiter_compMass1_linearY * normalDeltaImpulse; 849 | body1_angularVelocity += j_normalLimiter_compMass1_angular * normalDeltaImpulse; 850 | 851 | body2_velocityX += j_normalLimiter_compMass2_linearX * normalDeltaImpulse; 852 | body2_velocityY += j_normalLimiter_compMass2_linearY * normalDeltaImpulse; 853 | body2_angularVelocity += j_normalLimiter_compMass2_angular * normalDeltaImpulse; 854 | 855 | j_normalLimiter_accumulatedImpulse += normalDeltaImpulse; 856 | 857 | Vf frictiondV = Vf::zero(); 858 | 859 | frictiondV -= j_frictionLimiter_normalProjector1X * body1_velocityX; 860 | frictiondV -= j_frictionLimiter_normalProjector1Y * body1_velocityY; 861 | frictiondV -= j_frictionLimiter_angularProjector1 * body1_angularVelocity; 862 | 863 | frictiondV -= j_frictionLimiter_normalProjector2X * body2_velocityX; 864 | frictiondV -= j_frictionLimiter_normalProjector2Y * body2_velocityY; 865 | frictiondV -= j_frictionLimiter_angularProjector2 * body2_angularVelocity; 866 | 867 | Vf frictionDeltaImpulse = frictiondV * j_frictionLimiter_compInvMass; 868 | 869 | Vf reactionForce = j_normalLimiter_accumulatedImpulse; 870 | Vf accumulatedImpulse = j_frictionLimiter_accumulatedImpulse; 871 | 872 | Vf frictionForce = accumulatedImpulse + frictionDeltaImpulse; 873 | Vf reactionForceScaled = reactionForce * Vf::one(kFrictionCoefficient); 874 | 875 | Vf frictionForceAbs = abs(frictionForce); 876 | Vf reactionForceScaledSigned = flipsign(reactionForceScaled, frictionForce); 877 | Vf frictionDeltaImpulseAdjusted = reactionForceScaledSigned - accumulatedImpulse; 878 | 879 | frictionDeltaImpulse = select(frictionDeltaImpulse, frictionDeltaImpulseAdjusted, frictionForceAbs > reactionForceScaled); 880 | 881 | j_frictionLimiter_accumulatedImpulse += frictionDeltaImpulse; 882 | 883 | body1_velocityX += j_frictionLimiter_compMass1_linearX * frictionDeltaImpulse; 884 | body1_velocityY += j_frictionLimiter_compMass1_linearY * frictionDeltaImpulse; 885 | body1_angularVelocity += j_frictionLimiter_compMass1_angular * frictionDeltaImpulse; 886 | 887 | body2_velocityX += j_frictionLimiter_compMass2_linearX * frictionDeltaImpulse; 888 | body2_velocityY += j_frictionLimiter_compMass2_linearY * frictionDeltaImpulse; 889 | body2_angularVelocity += j_frictionLimiter_compMass2_angular * frictionDeltaImpulse; 890 | 891 | store(j_normalLimiter_accumulatedImpulse, 
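// The accumulated impulses are persisted for warm starting; the joint then counts as
// "productive" only if the larger of the two corrections exceeded kProductiveImpulse,
// which updates the per-body lastIteration stamps and feeds the early-out for the
// whole iteration.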
&jointP.normalLimiter_accumulatedImpulse[iP]); 892 | store(j_frictionLimiter_accumulatedImpulse, &jointP.frictionLimiter_accumulatedImpulse[iP]); 893 | 894 | Vf cumulativeImpulse = max(abs(normalDeltaImpulse), abs(frictionDeltaImpulse)); 895 | 896 | Vb productive = cumulativeImpulse > Vf::one(kProductiveImpulse); 897 | 898 | productive_any |= productive; 899 | 900 | body1_lastIteration = select(body1_lastIteration, iterationIndex0, productive); 901 | body2_lastIteration = select(body2_lastIteration, iterationIndex0, productive); 902 | 903 | body1_lastIterationf = bitcast(body1_lastIteration); 904 | body2_lastIterationf = bitcast(body2_lastIteration); 905 | 906 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 907 | solveBodiesImpulse.data, jointP.body1Index + iP, sizeof(SolveBody)); 908 | 909 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 910 | solveBodiesImpulse.data, jointP.body2Index + iP, sizeof(SolveBody)); 911 | } 912 | 913 | return any(productive_any); 914 | } 915 | 916 | template 917 | NOINLINE bool Solver::SolveJointsDisplacement(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex) 918 | { 919 | typedef simd::VNf Vf; 920 | typedef simd::VNi Vi; 921 | typedef simd::VNb Vb; 922 | 923 | assert(jointBegin % VN == 0 && jointEnd % VN == 0); 924 | 925 | Vi iterationIndex0 = Vi::one(iterationIndex); 926 | Vi iterationIndex2 = Vi::one(iterationIndex - 2); 927 | 928 | Vb productive_any = Vb::zero(); 929 | 930 | for (int jointIndex = jointBegin; jointIndex < jointEnd; jointIndex += VN) 931 | { 932 | int i = jointIndex; 933 | 934 | ContactJointPacked& jointP = joint_packed[unsigned(i) / N]; 935 | int iP = (VN == N) ? 0 : i & (N - 1); 936 | 937 | Vf body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf; 938 | Vf body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf; 939 | 940 | loadindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 941 | solveBodiesDisplacement.data, jointP.body1Index + iP, sizeof(SolveBody)); 942 | 943 | loadindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 944 | solveBodiesDisplacement.data, jointP.body2Index + iP, sizeof(SolveBody)); 945 | 946 | Vi body1_lastIteration = bitcast(body1_lastIterationf); 947 | Vi body2_lastIteration = bitcast(body2_lastIterationf); 948 | 949 | Vb body1_productive = body1_lastIteration > iterationIndex2; 950 | Vb body2_productive = body2_lastIteration > iterationIndex2; 951 | Vb body_productive = body1_productive | body2_productive; 952 | 953 | if (none(body_productive)) 954 | continue; 955 | 956 | Vf j_normalLimiter_normalProjector1X = Vf::load(&jointP.normalLimiter.normalProjector1X[iP]); 957 | Vf j_normalLimiter_normalProjector1Y = Vf::load(&jointP.normalLimiter.normalProjector1Y[iP]); 958 | Vf j_normalLimiter_normalProjector2X = Vf::load(&jointP.normalLimiter.normalProjector2X[iP]); 959 | Vf j_normalLimiter_normalProjector2Y = Vf::load(&jointP.normalLimiter.normalProjector2Y[iP]); 960 | Vf j_normalLimiter_angularProjector1 = Vf::load(&jointP.normalLimiter.angularProjector1[iP]); 961 | Vf j_normalLimiter_angularProjector2 = Vf::load(&jointP.normalLimiter.angularProjector2[iP]); 962 | 963 | Vf j_normalLimiter_compMass1_linearX = Vf::load(&jointP.normalLimiter.compMass1_linearX[iP]); 964 | Vf j_normalLimiter_compMass1_linearY = Vf::load(&jointP.normalLimiter.compMass1_linearY[iP]); 965 | Vf 
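// The displacement pass runs on its own velocity state (solveBodiesDisplacement); these
// pseudo-velocities are applied once during position integration and then cleared, so
// penetration correction never injects momentum into the impulse solve.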
j_normalLimiter_compMass2_linearX = Vf::load(&jointP.normalLimiter.compMass2_linearX[iP]); 966 | Vf j_normalLimiter_compMass2_linearY = Vf::load(&jointP.normalLimiter.compMass2_linearY[iP]); 967 | Vf j_normalLimiter_compMass1_angular = Vf::load(&jointP.normalLimiter.compMass1_angular[iP]); 968 | Vf j_normalLimiter_compMass2_angular = Vf::load(&jointP.normalLimiter.compMass2_angular[iP]); 969 | Vf j_normalLimiter_compInvMass = Vf::load(&jointP.normalLimiter.compInvMass[iP]); 970 | Vf j_normalLimiter_dstDisplacingVelocity = Vf::load(&jointP.normalLimiter_dstDisplacingVelocity[iP]); 971 | Vf j_normalLimiter_accumulatedDisplacingImpulse = Vf::load(&jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 972 | 973 | Vf dV = j_normalLimiter_dstDisplacingVelocity; 974 | 975 | dV -= j_normalLimiter_normalProjector1X * body1_velocityX; 976 | dV -= j_normalLimiter_normalProjector1Y * body1_velocityY; 977 | dV -= j_normalLimiter_angularProjector1 * body1_angularVelocity; 978 | 979 | dV -= j_normalLimiter_normalProjector2X * body2_velocityX; 980 | dV -= j_normalLimiter_normalProjector2Y * body2_velocityY; 981 | dV -= j_normalLimiter_angularProjector2 * body2_angularVelocity; 982 | 983 | Vf displacingDeltaImpulse = dV * j_normalLimiter_compInvMass; 984 | 985 | displacingDeltaImpulse = max(displacingDeltaImpulse, -j_normalLimiter_accumulatedDisplacingImpulse); 986 | 987 | body1_velocityX += j_normalLimiter_compMass1_linearX * displacingDeltaImpulse; 988 | body1_velocityY += j_normalLimiter_compMass1_linearY * displacingDeltaImpulse; 989 | body1_angularVelocity += j_normalLimiter_compMass1_angular * displacingDeltaImpulse; 990 | 991 | body2_velocityX += j_normalLimiter_compMass2_linearX * displacingDeltaImpulse; 992 | body2_velocityY += j_normalLimiter_compMass2_linearY * displacingDeltaImpulse; 993 | body2_angularVelocity += j_normalLimiter_compMass2_angular * displacingDeltaImpulse; 994 | 995 | j_normalLimiter_accumulatedDisplacingImpulse += displacingDeltaImpulse; 996 | 997 | store(j_normalLimiter_accumulatedDisplacingImpulse, &jointP.normalLimiter_accumulatedDisplacingImpulse[iP]); 998 | 999 | Vb productive = abs(displacingDeltaImpulse) > Vf::one(kProductiveImpulse); 1000 | 1001 | productive_any |= productive; 1002 | 1003 | body1_lastIteration = select(body1_lastIteration, iterationIndex0, productive); 1004 | body2_lastIteration = select(body2_lastIteration, iterationIndex0, productive); 1005 | 1006 | // this is a bit painful :( 1007 | body1_lastIterationf = bitcast(body1_lastIteration); 1008 | body2_lastIterationf = bitcast(body2_lastIteration); 1009 | 1010 | storeindexed4(body1_velocityX, body1_velocityY, body1_angularVelocity, body1_lastIterationf, 1011 | solveBodiesDisplacement.data, jointP.body1Index + iP, sizeof(SolveBody)); 1012 | 1013 | storeindexed4(body2_velocityX, body2_velocityY, body2_angularVelocity, body2_lastIterationf, 1014 | solveBodiesDisplacement.data, jointP.body2Index + iP, sizeof(SolveBody)); 1015 | } 1016 | 1017 | return any(productive_any); 1018 | } -------------------------------------------------------------------------------- /src/Solver.h: -------------------------------------------------------------------------------- 1 | #include "Joints.h" 2 | #include 3 | #include 4 | 5 | #include "base/AlignedArray.h" 6 | 7 | template 8 | struct ContactLimiterPacked 9 | { 10 | float normalProjector1X[N]; 11 | float normalProjector1Y[N]; 12 | float normalProjector2X[N]; 13 | float normalProjector2Y[N]; 14 | float angularProjector1[N]; 15 | float angularProjector2[N]; 16 | 17 | float 
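// Every limiter field is stored as a small array of N floats (structure of arrays), so a
// packed group of joints can be brought into a SIMD register with a single aligned load.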
compMass1_linearX[N]; 18 | float compMass1_linearY[N]; 19 | float compMass2_linearX[N]; 20 | float compMass2_linearY[N]; 21 | float compMass1_angular[N]; 22 | float compMass2_angular[N]; 23 | float compInvMass[N]; 24 | }; 25 | 26 | template 27 | struct ContactJointPacked 28 | { 29 | int body1Index[N]; 30 | int body2Index[N]; 31 | int contactPointIndex[N]; 32 | 33 | ContactLimiterPacked normalLimiter; 34 | 35 | float normalLimiter_compInvMass[N]; 36 | float normalLimiter_accumulatedImpulse[N]; 37 | 38 | float normalLimiter_dstVelocity[N]; 39 | float normalLimiter_dstDisplacingVelocity[N]; 40 | float normalLimiter_accumulatedDisplacingImpulse[N]; 41 | 42 | ContactLimiterPacked frictionLimiter; 43 | 44 | float frictionLimiter_accumulatedImpulse[N]; 45 | }; 46 | 47 | class WorkQueue; 48 | struct Configuration; 49 | 50 | struct Solver 51 | { 52 | Solver(); 53 | 54 | void SolveJoints(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 55 | 56 | void SolveJoints_Scalar(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 57 | void SolveJoints_SSE2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 58 | void SolveJoints_AVX2(WorkQueue& queue, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 59 | 60 | template 61 | void SolveJoints(WorkQueue& queue, AlignedArray>& joint_packed, RigidBody* bodies, int bodiesCount, ContactPoint* contactPoints, const Configuration& configuration); 62 | 63 | int GatherIslands(RigidBody* bodies, int bodiesCount, int groupSizeTarget); 64 | void PrepareBodies(RigidBody* bodies, int bodiesCount); 65 | void FinishBodies(RigidBody* bodies, int bodiesCount); 66 | 67 | template 68 | void SolveJointIsland(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints, const Configuration& configuration); 69 | 70 | template 71 | int PrepareJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd, int groupSizeTarget); 72 | template 73 | void FinishJoints(WorkQueue& queue, AlignedArray>& joint_packed, int jointBegin, int jointEnd); 74 | 75 | int PrepareIndices(int jointBegin, int jointEnd, int groupSizeTarget); 76 | 77 | template 78 | void RefreshJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, ContactPoint* contactPoints); 79 | template 80 | void PreStepJoints(ContactJointPacked* joint_packed, int jointBegin, int jointEnd); 81 | template 82 | bool SolveJointsImpulses(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex); 83 | template 84 | bool SolveJointsDisplacement(ContactJointPacked* joint_packed, int jointBegin, int jointEnd, int iterationIndex); 85 | 86 | struct SolveBodyParams 87 | { 88 | float invMass; 89 | float invInertia; 90 | 91 | Vector2f coords_pos; 92 | 93 | Vector2f coords_xVector; 94 | Vector2f coords_yVector; 95 | }; 96 | 97 | struct SolveBody 98 | { 99 | Vector2f velocity; 100 | float angularVelocity; 101 | 102 | int lastIteration; 103 | }; 104 | 105 | int islandCount; 106 | int islandMaxSize; 107 | 108 | AlignedArray solveBodiesParams; 109 | AlignedArray solveBodiesImpulse; 110 | AlignedArray solveBodiesDisplacement; 111 | 112 | AlignedArray contactJoints; 113 | 114 | AlignedArray jointGroup_bodies; 115 | AlignedArray jointGroup_joints; 116 | 117 | AlignedArray joint_index; 118 | 119 | 
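// Scratch state for island gathering: island_remap is the union-find parent table per body,
// island_index/island_indexremap map union-find roots to dense (and then coalesced) island
// slots, and island_offset/island_size describe each island's range inside joint_index.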
AlignedArray island_remap; 120 | AlignedArray island_index; 121 | AlignedArray island_indexremap; 122 | AlignedArray island_offset; 123 | AlignedArray island_offsettemp; 124 | AlignedArray island_size; 125 | 126 | AlignedArray> joint_packed1; 127 | AlignedArray> joint_packed4; 128 | AlignedArray> joint_packed8; 129 | }; -------------------------------------------------------------------------------- /src/Vector2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | struct Vector2 8 | { 9 | T x, y; 10 | T& operator[](const int i) 11 | { 12 | return *(&(x) + i); 13 | } 14 | inline Vector2() {} 15 | //inline Vector3d(const Vector3d & rhs) { *this = rhs; } 16 | inline Vector2(const T& _x, const T& _y) 17 | : x(_x) 18 | , y(_y) 19 | { 20 | } 21 | inline T Len() const 22 | { 23 | return sqrt(x * x + y * y); 24 | } 25 | 26 | inline T SquareLen() const 27 | { 28 | return x * x + y * y; 29 | } 30 | 31 | inline void Normalize() //Normalize itself 32 | { 33 | T l = sqrtf(x * x + y * y); 34 | if (fabs(l) > T(1e-8)) 35 | { 36 | T k = T(1.0) / l; 37 | x *= k; 38 | y *= k; 39 | } 40 | } 41 | 42 | inline void Invert() 43 | { 44 | x = -x; 45 | y = -y; 46 | } 47 | 48 | void Rotate(const T angle) 49 | { 50 | Vector2 self = *this; 51 | Vector2 x = self; 52 | Vector2 y = Vector2(-x.y, x.x); 53 | Vector2 delta = x * cos(angle) + y * sin(angle) - x; 54 | self += delta; 55 | *this = self; 56 | } 57 | 58 | inline Vector2 GetNorm() const 59 | { 60 | T l = sqrt(x * x + y * y); 61 | if (fabs(l) > T(1e-8)) 62 | { 63 | T k = T(1.0) / l; 64 | return Vector2(x * k, y * k); 65 | } 66 | else 67 | { 68 | return Vector2(0, 0); 69 | } 70 | } 71 | 72 | inline Vector2 operator-() const 73 | { 74 | return Vector2(-x, -y); 75 | } 76 | 77 | void Decrease(T val) 78 | { 79 | if (SquareLen() > val * val) 80 | { 81 | T len = Len(); 82 | T scale = (len - val) / len; 83 | x *= scale; 84 | y *= scale; 85 | } 86 | else 87 | { 88 | x = 0.0f; 89 | y = 0.0f; 90 | } 91 | } 92 | 93 | inline Vector2& operator*=(const T& val) 94 | { 95 | x *= val; 96 | y *= val; 97 | return *this; 98 | } 99 | 100 | inline Vector2& operator/=(const T& val) 101 | { 102 | T inv = T(1.0) / val; 103 | x *= inv; 104 | y *= inv; 105 | return *this; 106 | } 107 | inline Vector2& operator+=(const Vector2& vec) 108 | { 109 | x += vec.x; 110 | y += vec.y; 111 | return *this; 112 | } 113 | inline Vector2& operator-=(const Vector2& vec) 114 | { 115 | x -= vec.x; 116 | y -= vec.y; 117 | return *this; 118 | } 119 | inline Vector2& operator--() 120 | { 121 | x = -x; 122 | y = -y; 123 | return *this; 124 | } 125 | 126 | inline Vector2 operator+(const Vector2& vec) const 127 | { 128 | return Vector2(x + vec.x, y + vec.y); 129 | } 130 | inline Vector2 operator-(const Vector2& vec) const 131 | { 132 | return Vector2(x - vec.x, y - vec.y); 133 | } 134 | inline T operator*(const Vector2& vec) const 135 | { 136 | return x * vec.x + y * vec.y; 137 | } 138 | inline Vector2 operator*(const T& val) const 139 | { 140 | return Vector2(x * val, y * val); 141 | } 142 | Vector2 GetPerpendicular() const 143 | { 144 | return Vector2(-y, x); 145 | } 146 | 147 | template 148 | inline const Vector2 operator=(const SomeVector& v) 149 | { 150 | x = v.x; 151 | y = v.y; 152 | return *this; 153 | } 154 | 155 | static const Vector2 zeroVector() 156 | { 157 | return Vector2(0, 0); 158 | } 159 | 160 | static const Vector2 zero() 161 | { 162 | return zeroVector(); 163 | } 164 | 165 | static const Vector2 one() 
166 | { 167 | return Vector2(T(1.0), T(1.0)); 168 | } 169 | 170 | static const Vector2 xAxis() 171 | { 172 | return Vector2(T(1.0), 0); 173 | } 174 | 175 | static const Vector2 yAxis() 176 | { 177 | return Vector2(0, T(1.0)); 178 | } 179 | }; 180 | 181 | template 182 | inline Vector2 operator*(const T& d, const Vector2& V) 183 | { 184 | return Vector2(V.x * d, V.y * d); 185 | } 186 | 187 | template 188 | inline Vector2 operator/(const Vector2& V, const T& d) 189 | { 190 | T invd = T(1.0) / d; 191 | return Vector2(V.x * invd, V.y * invd); 192 | } 193 | 194 | template 195 | inline T operator^(const Vector2& v1, const Vector2& v2) 196 | { 197 | return v1.x * v2.y - v1.y * v2.x; 198 | } 199 | 200 | typedef Vector2 Vector2f; 201 | typedef Vector2 Vector2d; 202 | 203 | const static Vector2f zeroVector2f = Vector2f(0, 0); 204 | const static Vector2f xAxis2f = Vector2f(1, 0); 205 | const static Vector2f yAxis2f = Vector2f(0, 1); 206 | 207 | const static Vector2d zeroVector2d = Vector2d(0, 0); 208 | const static Vector2d xAxis2d = Vector2d(1, 0); 209 | const static Vector2d yAxis2d = Vector2d(0, 1); 210 | 211 | template 212 | bool GetTwoLinesIntersection(const Vector2& p1, const Vector2& p2, const Vector2& t1, const Vector2& t2, Vector2& p0) 213 | { 214 | Vector2 v1, v2; 215 | T k1, k2; 216 | v1 = p2 - p1; 217 | v2 = t2 - t1; 218 | T invmul; 219 | T mul = v1 ^ v2; 220 | if (fabs(mul) > T(1e-5)) 221 | { 222 | invmul = 1.0f / (v1 ^ v2); 223 | k2 = ((t1 ^ v1) - (p1 ^ v1)) * invmul; /*p1.x * v1.y - p1.y * v1.x + t1.y * v1.x - t1.x * v1.y*/ 224 | k1 = ((t1 ^ v2) - (p1 ^ v2)) * invmul; 225 | p0 = p1 + v1 * k1; 226 | // Vector p02 = t1 + v2 * k2; 227 | return ((k1 > 0.0f) && (k1 < 1.0f) && (k2 > 0.0f) && (k2 < 1.0f)); 228 | } 229 | else 230 | { 231 | return 0; 232 | } 233 | p0 = t1 + (t1 - t2); //100% bad point 234 | return 0; 235 | } 236 | 237 | template 238 | bool ProjectPointToLine(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0, T& signOfSide) 239 | { 240 | Vector2 v1 = p - t1; 241 | Vector2 v2 = t2 - t1; 242 | signOfSide = sgn(v1 ^ v2); 243 | p0 = t1 + v2 * ((v1 * v2) / v2.SquareLen()); 244 | if ((v1 * v2 >= 0.0f) && ((v1 * v2) / (v2.SquareLen()) <= 1.0f)) 245 | { 246 | return 1; 247 | } 248 | else 249 | { 250 | return 0; 251 | } 252 | } 253 | 254 | template 255 | bool ProjectPointToLine(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0) 256 | { 257 | T signOfSide; 258 | return ProjectPointToLine(t1, t2, p, p0, signOfSide); 259 | } 260 | 261 | template 262 | T PointToSegmentDistanse(const Vector2& t1, const Vector2& t2, const Vector2& p) 263 | { 264 | Vector2 p0; 265 | T signOfSide; 266 | if (ProjectPointToLine(t1, t2, p, p0, signOfSide)) 267 | { 268 | return Vector2(p.x - p0.x, p.y - p0.y).Len(); 269 | } 270 | else 271 | { 272 | return Min(Vector2(p.x - t1.x, p.y - t1.y).Len(), 273 | Vector2(p.x - t2.x, p.y - t2.y).Len()); 274 | } 275 | } 276 | 277 | template 278 | void ProjectPointToLine(const Vector2& point, const Vector2& planePoint, const Vector2& planeNormal, const Vector2& projectionDirection, 279 | Vector2f& projectedPoint) 280 | { 281 | float mult = 1.0f / (projectionDirection * planeNormal); 282 | projectedPoint = point + projectionDirection * ((planePoint * planeNormal) - (point * planeNormal)) * mult; 283 | } 284 | 285 | template 286 | void ProjectPointToPlane(const Vector2& point, const Vector2& planePoint, const Vector2& planeNormal, const Vector2& projectionDirection, 287 | Vector2f& projectedPoint) 288 | { 289 | ProjectPointToLine(point, 
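// In 2D a "plane" degenerates to a line, so projecting onto a plane simply forwards to the
// line projection overload above.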
planePoint, planeNormal, projectionDirection, projectedPoint); 290 | } 291 | 292 | template 293 | void ProjectPointToLine_noreturn(const Vector2& t1, const Vector2& t2, const Vector2& p, Vector2& p0, T& signOfSide) 294 | { 295 | Vector2 v1 = p - t1; 296 | Vector2 v2 = t2 - t1; 297 | signOfSide = sgn(v1 ^ v2); 298 | p0 = t1 + v2 * ((v1 * v2) / v2.SquareLen()); 299 | } 300 | 301 | template 302 | bool IsPointInCellEx(const typename GeomSpace::Vector2 points[3], typename GeomSpace::Vector2 testPoint, typename GeomSpace::Scalar eps = 0) 303 | { 304 | typedef typename GeomSpace::Scalar Scalar; 305 | 306 | Scalar side0 = ((points[1] - points[0]) ^ (testPoint - points[0])); 307 | Scalar side1 = ((points[2] - points[1]) ^ (testPoint - points[1])); 308 | Scalar side2 = ((points[0] - points[2]) ^ (testPoint - points[2])); 309 | 310 | if (side0 >= -eps && side1 >= -eps && side2 >= -eps) return 1; 311 | if (side0 <= eps && side1 <= eps && side2 <= eps) return 1; 312 | return 0; 313 | } 314 | 315 | template 316 | bool IsPointInCell(const typename GeomSpace::Vector2 points[3], typename GeomSpace::Vector2 testPoint) 317 | { 318 | typedef typename GeomSpace::Scalar Scalar; 319 | //Scalar //eps = 0;//-1e-4;//std::numeric_limits::epsilon();//Scalar(1e-9); 320 | Scalar eps = std::numeric_limits::epsilon(); 321 | 322 | Scalar side0 = ((points[1] - points[0]) ^ (testPoint - points[0])); 323 | Scalar side1 = ((points[2] - points[1]) ^ (testPoint - points[1])); 324 | Scalar side2 = ((points[0] - points[2]) ^ (testPoint - points[2])); 325 | 326 | if (side0 >= -eps && side1 >= -eps && side2 >= -eps) return 1; 327 | if (side0 <= eps && side1 <= eps && side2 <= eps) return 1; 328 | return 0; 329 | 330 | /*Scalar eps = std::numeric_limits::epsilon();//Scalar(1e-9); 331 | Scalar side012 = mixed_product(points[1] - points[0], points[2] - points[0], testPoint - points[0]) * 332 | mixed_product(points[1] - points[0], points[2] - points[0], points[3] - points[0]); 333 | if(side012 < -eps) return 0; 334 | 335 | Scalar side123 = mixed_product(points[1] - points[2], points[3] - points[2], testPoint - points[2]) * 336 | mixed_product(points[1] - points[2], points[3] - points[2], points[0] - points[2]); 337 | if(side123 < -eps) return 0; 338 | 339 | Scalar side230 = mixed_product(points[2] - points[3], points[0] - points[3], testPoint - points[3]) * 340 | mixed_product(points[2] - points[3], points[0] - points[3], points[1] - points[3]); 341 | if(side230 < -eps) return 0; 342 | 343 | Scalar side013 = mixed_product(points[0] - points[1], points[3] - points[1], testPoint - points[1]) * 344 | mixed_product(points[0] - points[1], points[3] - points[1], points[2] - points[1]); 345 | if(side013 < -eps) return 0; 346 | 347 | return 1;*/ 348 | 349 | /*Scalar side1 = mixed_product(points[2] - points[0], points[3] - points[0], testPoint - points[0]); 350 | Scalar side2 = mixed_product(points[3] - points[0], points[1] - points[0], testPoint - points[0]); 351 | Scalar side3 = mixed_product(points[3] - points[1], points[2] - points[1], testPoint - points[1]); 352 | if (side0 * side1 < 0) return 0; 353 | if (side1 * side2 < 0) return 0; 354 | if (side2 * side3 < 0) return 0; 355 | 356 | return 1;*/ 357 | } 358 | -------------------------------------------------------------------------------- /src/World.cpp: -------------------------------------------------------------------------------- 1 | #include "World.h" 2 | 3 | #include "base/Parallel.h" 4 | #include "microprofile.h" 5 | 6 | World::World() 7 | : gravity(0) 8 | { 9 | } 10 | 11 | 
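// Bodies live by value in a contiguous AlignedArray, so the pointer returned by AddBody
// below is only valid until a later AddBody forces the array to reallocate.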
RigidBody* World::AddBody(Coords2f coords, Vector2f size) 12 | { 13 | RigidBody newbie(coords, size, 1e-5f); 14 | newbie.index = bodies.size; 15 | bodies.push_back(newbie); 16 | return &(bodies[bodies.size - 1]); 17 | } 18 | 19 | void World::Update(WorkQueue& queue, float dt, const Configuration& configuration) 20 | { 21 | MICROPROFILE_SCOPEI("Physics", "Update", 0x00ff00); 22 | 23 | collisionTime = mergeTime = solveTime = 0; 24 | 25 | IntegrateVelocity(queue, dt); 26 | 27 | collider.UpdateBroadphase(bodies.data, bodies.size); 28 | collider.UpdatePairs(queue, bodies.data, bodies.size); 29 | collider.UpdateManifolds(queue, bodies.data); 30 | collider.PackManifolds(bodies.data); 31 | 32 | RefreshContactJoints(); 33 | 34 | solver.SolveJoints(queue, bodies.data, bodies.size, collider.contactPoints.data, configuration); 35 | 36 | IntegratePosition(queue, dt); 37 | } 38 | 39 | NOINLINE void World::IntegrateVelocity(WorkQueue& queue, float dt) 40 | { 41 | MICROPROFILE_SCOPEI("Physics", "IntegrateVelocity", -1); 42 | 43 | parallelFor(queue, bodies.data, bodies.size, 32, [this, dt](RigidBody& body, int) { 44 | if (body.invMass > 0.0f) 45 | { 46 | body.acceleration.y += gravity; 47 | } 48 | 49 | body.velocity += body.acceleration * dt; 50 | body.acceleration = Vector2f(0.0f, 0.0f); 51 | 52 | body.angularVelocity += body.angularAcceleration * dt; 53 | body.angularAcceleration = 0.0f; 54 | }); 55 | } 56 | 57 | NOINLINE void World::IntegratePosition(WorkQueue& queue, float dt) 58 | { 59 | MICROPROFILE_SCOPEI("Physics", "IntegratePosition", -1); 60 | 61 | parallelFor(queue, bodies.data, bodies.size, 32, [dt](RigidBody& body, int) { 62 | body.coords.pos += body.displacingVelocity + body.velocity * dt; 63 | body.coords.Rotate(-(body.displacingAngularVelocity + body.angularVelocity * dt)); 64 | 65 | body.displacingVelocity = Vector2f(0.0f, 0.0f); 66 | body.displacingAngularVelocity = 0.0f; 67 | 68 | body.UpdateGeom(); 69 | }); 70 | } 71 | 72 | NOINLINE void World::RefreshContactJoints() 73 | { 74 | MICROPROFILE_SCOPEI("Physics", "RefreshContactJoints", -1); 75 | 76 | int matched = 0; 77 | int created = 0; 78 | int deleted = 0; 79 | 80 | { 81 | MICROPROFILE_SCOPEI("Physics", "Reset", -1); 82 | 83 | for (int jointIndex = 0; jointIndex < solver.contactJoints.size; jointIndex++) 84 | { 85 | solver.contactJoints[jointIndex].contactPointIndex = -1; 86 | } 87 | } 88 | 89 | { 90 | MICROPROFILE_SCOPEI("Physics", "Match", -1); 91 | 92 | for (int manifoldIndex = 0; manifoldIndex < collider.manifolds.size; ++manifoldIndex) 93 | { 94 | Manifold& man = collider.manifolds[manifoldIndex]; 95 | 96 | for (int collisionIndex = 0; collisionIndex < man.pointCount; collisionIndex++) 97 | { 98 | int contactPointIndex = man.pointIndex + collisionIndex; 99 | ContactPoint& col = collider.contactPoints[contactPointIndex]; 100 | 101 | if (col.solverIndex < 0) 102 | { 103 | col.solverIndex = solver.contactJoints.size; 104 | 105 | solver.contactJoints.push_back(ContactJoint(man.body1Index, man.body2Index, contactPointIndex)); 106 | 107 | created++; 108 | } 109 | else 110 | { 111 | ContactJoint& joint = solver.contactJoints[col.solverIndex]; 112 | 113 | assert(joint.body1Index == man.body1Index); 114 | assert(joint.body2Index == man.body2Index); 115 | 116 | joint.contactPointIndex = contactPointIndex; 117 | 118 | matched++; 119 | } 120 | } 121 | } 122 | } 123 | 124 | { 125 | MICROPROFILE_SCOPEI("Physics", "Cleanup", -1); 126 | 127 | for (int jointIndex = 0; jointIndex < solver.contactJoints.size;) 128 | { 129 | ContactJoint& joint = 
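// Cleanup: joints whose contact point disappeared this frame (contactPointIndex still -1
// after matching) are removed by overwriting them with the last joint; survivors write
// their slot back into the contact point's solverIndex so the contact cache stays consistent.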
solver.contactJoints[jointIndex]; 130 | 131 | if (joint.contactPointIndex < 0) 132 | { 133 | joint = solver.contactJoints[solver.contactJoints.size - 1]; 134 | solver.contactJoints.size--; 135 | 136 | deleted++; 137 | } 138 | else 139 | { 140 | collider.contactPoints[joint.contactPointIndex].solverIndex = jointIndex; 141 | jointIndex++; 142 | } 143 | } 144 | } 145 | 146 | MICROPROFILE_META_CPU("Matched", matched); 147 | MICROPROFILE_META_CPU("Created", created); 148 | MICROPROFILE_META_CPU("Deleted", deleted); 149 | } -------------------------------------------------------------------------------- /src/World.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "RigidBody.h" 4 | #include "Collider.h" 5 | #include "Solver.h" 6 | 7 | struct Configuration; 8 | 9 | struct World 10 | { 11 | enum SolveMode 12 | { 13 | Solve_Scalar, 14 | Solve_SSE2, 15 | Solve_AVX2, 16 | }; 17 | 18 | World(); 19 | 20 | RigidBody* AddBody(Coords2f coords, Vector2f size); 21 | 22 | void Update(WorkQueue& queue, float dt, const Configuration& configuration); 23 | 24 | NOINLINE void IntegrateVelocity(WorkQueue& queue, float dt); 25 | NOINLINE void IntegratePosition(WorkQueue& queue, float dt); 26 | NOINLINE void RefreshContactJoints(); 27 | 28 | float collisionTime; 29 | float mergeTime; 30 | float solveTime; 31 | 32 | AlignedArray bodies; 33 | Collider collider; 34 | Solver solver; 35 | 36 | float gravity; 37 | }; -------------------------------------------------------------------------------- /src/base/AlignedArray.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | struct AlignedArray 8 | { 9 | T* data; 10 | int size; 11 | int capacity; 12 | 13 | AlignedArray() 14 | : data(0) 15 | , size(0) 16 | , capacity(0) 17 | { 18 | } 19 | 20 | ~AlignedArray() 21 | { 22 | _mm_free(data); 23 | } 24 | 25 | AlignedArray(const AlignedArray&) = delete; 26 | AlignedArray& operator=(const AlignedArray&) = delete; 27 | 28 | AlignedArray(AlignedArray&& other) 29 | { 30 | data = other.data; 31 | size = other.size; 32 | capacity = other.capacity; 33 | 34 | other.data = 0; 35 | other.size = 0; 36 | other.capacity = 0; 37 | } 38 | 39 | AlignedArray& operator=(AlignedArray&& other) 40 | { 41 | _mm_free(data); 42 | 43 | data = other.data; 44 | size = other.size; 45 | capacity = other.capacity; 46 | 47 | other.data = 0; 48 | other.size = 0; 49 | other.capacity = 0; 50 | 51 | return *this; 52 | } 53 | 54 | T* begin() 55 | { 56 | return data; 57 | } 58 | 59 | T* end() 60 | { 61 | return data + size; 62 | } 63 | 64 | T& operator[](int i) 65 | { 66 | assert(i >= 0 && i < size); 67 | return data[i]; 68 | } 69 | 70 | const T& operator[](int i) const 71 | { 72 | assert(i >= 0 && i < size); 73 | return data[i]; 74 | } 75 | 76 | void push_back(const T& value) 77 | { 78 | if (size == capacity) 79 | { 80 | T copy = value; 81 | 82 | realloc(size + 1, true); 83 | 84 | data[size++] = copy; 85 | } 86 | else 87 | { 88 | data[size++] = value; 89 | } 90 | } 91 | 92 | void truncate(int newsize) 93 | { 94 | assert(newsize <= size); 95 | 96 | size = newsize; 97 | } 98 | 99 | void resize_copy(int newsize) 100 | { 101 | if (newsize > capacity) 102 | realloc(newsize, true); 103 | 104 | size = newsize; 105 | } 106 | 107 | void resize(int newsize) 108 | { 109 | if (newsize > capacity) 110 | realloc(newsize, false); 111 | 112 | size = newsize; 113 | } 114 | 115 | void realloc(int newsize, bool copy) 116 | { 
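// Grow geometrically (roughly 1.5x per step) and allocate with 32-byte alignment so the data
// can be loaded straight into AVX registers; resize() without copy leaves the new contents
// deliberately uninitialized.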
117 | int newcapacity = capacity; 118 | while (newcapacity < newsize) 119 | newcapacity += newcapacity / 2 + 1; 120 | 121 | // Leave 32b padding at the end to avoid buffer overruns for fast SIMD code 122 | T* newdata = static_cast(_mm_malloc(newcapacity * sizeof(T) + 32, 32)); 123 | 124 | if (data) 125 | { 126 | if (copy) 127 | memcpy(newdata, data, size * sizeof(T)); 128 | _mm_free(data); 129 | } 130 | 131 | data = newdata; 132 | capacity = newcapacity; 133 | } 134 | 135 | void clear() 136 | { 137 | size = 0; 138 | } 139 | }; 140 | -------------------------------------------------------------------------------- /src/base/DenseHash.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Internal implementation of DenseHashSet and DenseHashMap 9 | namespace detail 10 | { 11 | template 12 | class DenseHashTable 13 | { 14 | public: 15 | typedef typename std::vector::const_iterator const_iterator; 16 | 17 | DenseHashTable(size_t capacity, const Hash& hash, const Eq& eq) 18 | : filled(0) 19 | , hash(hash) 20 | , eq(eq) 21 | { 22 | if (capacity) 23 | { 24 | items.reserve(capacity); 25 | rehash(capacity); 26 | } 27 | } 28 | 29 | const_iterator begin() const 30 | { 31 | return items.begin(); 32 | } 33 | 34 | const_iterator end() const 35 | { 36 | return items.end(); 37 | } 38 | 39 | bool empty() const 40 | { 41 | return items.empty(); 42 | } 43 | 44 | size_t size() const 45 | { 46 | return items.size(); 47 | } 48 | 49 | size_t bucket_count() const 50 | { 51 | return buckets.size(); 52 | } 53 | 54 | float load_factor() const 55 | { 56 | return buckets.empty() ? 0 : float(filled) / float(buckets.size()); 57 | } 58 | 59 | void clear() 60 | { 61 | items.clear(); 62 | buckets.clear(); 63 | filled = 0; 64 | } 65 | 66 | protected: 67 | std::vector items; 68 | std::vector buckets; 69 | size_t filled; // number of non-empty buckets 70 | 71 | Hash hash; 72 | Eq eq; 73 | 74 | void rehash() 75 | { 76 | if (filled >= buckets.size() * 3 / 4) 77 | { 78 | rehash(items.size() * 2); 79 | } 80 | } 81 | 82 | void rehash(size_t capacity) 83 | { 84 | size_t newbuckets = 16; 85 | while (newbuckets < capacity) newbuckets *= 2; 86 | 87 | size_t hashmod = newbuckets - 1; 88 | 89 | std::vector(newbuckets, -1).swap(buckets); 90 | 91 | for (size_t i = 0; i < items.size(); ++i) 92 | { 93 | size_t bucket = hash(getKey(items[i])) & hashmod; 94 | 95 | for (size_t probe = 0; probe <= hashmod; ++probe) 96 | { 97 | if (buckets[bucket] < 0) 98 | { 99 | buckets[bucket] = i; 100 | break; 101 | } 102 | 103 | // Hash collision, quadratic probing 104 | bucket = (bucket + probe + 1) & hashmod; 105 | } 106 | } 107 | 108 | filled = items.size(); 109 | } 110 | 111 | int find_bucket(const Key& key) const 112 | { 113 | if (buckets.empty()) 114 | return -1; 115 | 116 | size_t hashmod = buckets.size() - 1; 117 | size_t bucket = hash(key) & hashmod; 118 | 119 | for (size_t probe = 0; probe <= hashmod; ++probe) 120 | { 121 | int32_t probe_index = buckets[bucket]; 122 | 123 | // Element does not exist, insert here 124 | if (probe_index == -1) 125 | return -1; 126 | 127 | // Not a tombstone and key matches 128 | if (probe_index >= 0 && eq(getKey(items[probe_index]), key)) 129 | return bucket; 130 | 131 | // Hash collision, quadratic probing 132 | bucket = (bucket + probe + 1) & hashmod; 133 | } 134 | 135 | // Hash table is full - this should not happen 136 | assert(false); 137 | return -1; 138 | } 139 | 140 | std::pair insert_item(const 
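// Buckets hold indices into the dense items vector: -1 means never used, -2 is a tombstone
// left by erase. Insertion reuses the first negative bucket it probes, but only consuming a
// never-used (-1) bucket increments `filled`, which drives the load-factor based rehash.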
Key& key) 141 | { 142 | assert(!buckets.empty()); 143 | 144 | size_t hashmod = buckets.size() - 1; 145 | size_t bucket = hash(key) & hashmod; 146 | 147 | for (size_t probe = 0; probe <= hashmod; ++probe) 148 | { 149 | int32_t probe_index = buckets[bucket]; 150 | 151 | // Element does not exist or a tombstone, insert here 152 | // TODO: this is incorrect! we have to follow the chain of tombstones to the end just in case our element does exist :( 153 | if (probe_index < 0) 154 | { 155 | buckets[bucket] = items.size(); 156 | filled += probe_index == -1; 157 | 158 | items.push_back(Item()); 159 | getKey(items.back()) = key; 160 | 161 | return std::make_pair(&items.back(), true); 162 | } 163 | 164 | // Key matches, insert here 165 | if (eq(getKey(items[probe_index]), key)) 166 | return std::make_pair(&items[probe_index], false); 167 | 168 | // Hash collision, quadratic probing 169 | bucket = (bucket + probe + 1) & hashmod; 170 | } 171 | 172 | // Hash table is full - this should not happen 173 | assert(false); 174 | return std::make_pair(static_cast(0), false); 175 | } 176 | 177 | void erase_bucket(int bucket) 178 | { 179 | assert(bucket >= 0); 180 | 181 | int32_t probe_index = buckets[bucket]; 182 | assert(probe_index >= 0); 183 | 184 | // move last key 185 | // TODO: this is suboptimal! we don't need to compare keys when searching for this, it's enough to find a bucket that points to items.size()-1 186 | int probe_bucket = find_bucket(getKey(items.back())); 187 | assert(probe_bucket >= 0); 188 | assert(buckets[probe_bucket] == int32_t(items.size() - 1)); 189 | 190 | items[probe_index] = items.back(); 191 | buckets[probe_bucket] = probe_index; 192 | 193 | items.pop_back(); 194 | 195 | // mark bucket as tombstone 196 | buckets[bucket] = -2; 197 | } 198 | 199 | private: 200 | // Interface to support both key and pair 201 | static const Key& getKey(const Key& item) { return item; } 202 | static Key& getKey(Key& item) { return item; } 203 | template static const Key& getKey(const std::pair& item) { return item.first; } 204 | template static Key& getKey(std::pair& item) { return item.first; } 205 | }; 206 | } 207 | 208 | template , typename Eq = std::equal_to> 209 | class DenseHashSet: public detail::DenseHashTable 210 | { 211 | public: 212 | explicit DenseHashSet(size_t capacity = 0, const Hash& hash = Hash(), const Eq& eq = Eq()) 213 | : detail::DenseHashTable(capacity, hash, eq) 214 | { 215 | } 216 | 217 | bool contains(const Key& key) const 218 | { 219 | return this->find_bucket(key) >= 0; 220 | } 221 | 222 | bool insert(const Key& key) 223 | { 224 | this->rehash(); 225 | 226 | return this->insert_item(key).second; 227 | } 228 | 229 | void erase(const Key& key) 230 | { 231 | int bucket = this->find_bucket(key); 232 | 233 | if (bucket >= 0) 234 | this->erase_bucket(bucket); 235 | } 236 | }; 237 | 238 | // This is a faster alternative of std::unordered_map, but it does not implement the same interface (i.e. it does not support erasing and has contains() instead of find()) 239 | template , typename Eq = std::equal_to> 240 | class DenseHashMap: public detail::DenseHashTable, Hash, Eq> 241 | { 242 | public: 243 | explicit DenseHashMap(size_t capacity = 0, const Hash& hash = Hash(), const Eq& eq = Eq()) 244 | : detail::DenseHashTable, Hash, Eq>(capacity, hash, eq) 245 | { 246 | } 247 | 248 | const Value* find(const Key& key) const 249 | { 250 | int bucket = this->find_bucket(key); 251 | 252 | return bucket < 0 ? 
NULL : &this->items[this->buckets[bucket]].second; 253 | } 254 | 255 | Value& operator[](const Key& key) 256 | { 257 | this->rehash(); 258 | 259 | return this->insert_item(key).first->second; 260 | } 261 | 262 | void erase(const Key& key) 263 | { 264 | int bucket = this->find_bucket(key); 265 | 266 | if (bucket >= 0) 267 | this->erase_bucket(bucket); 268 | } 269 | }; 270 | -------------------------------------------------------------------------------- /src/base/Parallel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "WorkQueue.h" 4 | 5 | #include 6 | 7 | #include "microprofile.h" 8 | 9 | template 10 | inline T& parallelForIndex(T* data, unsigned int index) 11 | { 12 | return data[index]; 13 | } 14 | 15 | inline unsigned int parallelForIndex(unsigned int data, unsigned int index) 16 | { 17 | return data + index; 18 | } 19 | 20 | template 21 | inline void serialFor(WorkQueue& queue, T data, unsigned int count, unsigned int groupSize, F func) 22 | { 23 | for (unsigned int i = 0; i < count; ++i) 24 | func(parallelForIndex(data, i), 0); 25 | } 26 | 27 | template 28 | inline void parallelFor(WorkQueue& queue, T data, unsigned int count, unsigned int groupSize, F func) 29 | { 30 | if (queue.getWorkerCount() == 0 || count <= groupSize) 31 | { 32 | for (unsigned int i = 0; i < count; ++i) 33 | func(parallelForIndex(data, i), 0); 34 | 35 | return; 36 | } 37 | 38 | MICROPROFILE_SCOPEI("WorkQueue", "ParallelFor", 0x808080); 39 | 40 | struct Item: WorkQueue::Item 41 | { 42 | WorkQueue* queue; 43 | 44 | std::atomic counter; 45 | std::atomic ready; 46 | 47 | T data; 48 | unsigned int count; 49 | 50 | unsigned int groupSize; 51 | unsigned int groupCount; 52 | 53 | F* func; 54 | 55 | Item(): counter(0), ready(0) 56 | { 57 | } 58 | 59 | void run(int worker) override 60 | { 61 | unsigned int groups = 0; 62 | 63 | for (;;) 64 | { 65 | unsigned int groupIndex = counter.fetch_add(1); 66 | if (groupIndex >= groupCount) break; 67 | 68 | unsigned int begin = groupIndex * groupSize; 69 | unsigned int end = std::min(count, begin + groupSize); 70 | 71 | for (unsigned int i = begin; i < end; ++i) 72 | (*func)(parallelForIndex(data, i), worker); 73 | 74 | groups++; 75 | } 76 | 77 | ready.fetch_add(groups); 78 | } 79 | }; 80 | 81 | auto item = std::make_shared(); 82 | 83 | item->queue = &queue; 84 | item->data = data; 85 | item->count = count; 86 | item->groupSize = groupSize; 87 | item->groupCount = (count + groupSize - 1) / groupSize; 88 | item->func = &func; 89 | 90 | int optimalWorkerCount = std::min(unsigned(queue.getWorkerCount()), item->groupCount - 1); 91 | 92 | queue.pushItem(item, optimalWorkerCount); 93 | 94 | item->run(queue.getWorkerCount()); 95 | 96 | if (item->ready.load() < item->groupCount) 97 | { 98 | MICROPROFILE_SCOPEI("WorkQueue", "Wait", 0xff0000); 99 | 100 | do std::this_thread::yield(); 101 | while (item->ready.load() < item->groupCount); 102 | } 103 | } -------------------------------------------------------------------------------- /src/base/RadixSort.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | inline unsigned int radixUnsignedInt(unsigned int v) 4 | { 5 | return v; 6 | } 7 | 8 | inline unsigned int radixInt(int v) 9 | { 10 | // flip sign bit 11 | return static_cast(v) ^ 0x80000000; 12 | } 13 | 14 | inline unsigned int radixUnsignedFloat(const float& v) 15 | { 16 | return *reinterpret_cast(&v); 17 | } 18 | 19 | inline unsigned int radixFloat(const float& v) 20 
| { 21 | // if sign bit is 0, flip sign bit 22 | // if sign bit is 1, flip everything 23 | int f = *reinterpret_cast(&v); 24 | unsigned int mask = int(f >> 31) | 0x80000000; 25 | return f ^ mask; 26 | } 27 | 28 | template inline T* radixSort3(T* e0, T* e1, size_t count, Pred pred) 29 | { 30 | unsigned int h[2048*3]; 31 | 32 | for (size_t i = 0; i < 2048*3; ++i) h[i] = 0; 33 | 34 | unsigned int* h0 = h; 35 | unsigned int* h1 = h + 2048; 36 | unsigned int* h2 = h + 2048*2; 37 | 38 | T* e0_end = e0 + count; 39 | 40 | #define _0(h) ((h) & 2047) 41 | #define _1(h) (((h) >> 11) & 2047) 42 | #define _2(h) ((h) >> 22) 43 | 44 | // fill histogram 45 | for (const T* i = e0; i != e0_end; ++i) 46 | { 47 | unsigned int h = pred(*i); 48 | 49 | h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; 50 | } 51 | 52 | // compute offsets 53 | { 54 | unsigned int sum0 = 0, sum1 = 0, sum2 = 0; 55 | 56 | for (unsigned int i = 0; i < 2048; ++i) 57 | { 58 | unsigned int c0 = h0[i]; 59 | unsigned int c1 = h1[i]; 60 | unsigned int c2 = h2[i]; 61 | 62 | h0[i] = sum0; 63 | h1[i] = sum1; 64 | h2[i] = sum2; 65 | 66 | sum0 += c0; 67 | sum1 += c1; 68 | sum2 += c2; 69 | } 70 | } 71 | 72 | for (size_t i = 0; i < count; ++i) 73 | { 74 | unsigned int h = pred(e0[i]); 75 | e1[h0[_0(h)]++] = e0[i]; 76 | } 77 | 78 | for (size_t i = 0; i < count; ++i) 79 | { 80 | unsigned int h = pred(e1[i]); 81 | e0[h1[_1(h)]++] = e1[i]; 82 | } 83 | 84 | for (size_t i = 0; i < count; ++i) 85 | { 86 | unsigned int h = pred(e0[i]); 87 | e1[h2[_2(h)]++] = e0[i]; 88 | } 89 | 90 | #undef _0 91 | #undef _1 92 | #undef _2 93 | 94 | return e1; 95 | } 96 | 97 | template inline T* radixSort4(T* e0, T* e1, size_t count, Pred pred) 98 | { 99 | unsigned int h[256*4]; 100 | 101 | for (size_t i = 0; i < 256*4; ++i) h[i] = 0; 102 | 103 | unsigned int* h0 = h; 104 | unsigned int* h1 = h + 256; 105 | unsigned int* h2 = h + 256*2; 106 | unsigned int* h3 = h + 256*3; 107 | 108 | T* e0_end = e0 + count; 109 | 110 | #define _0(h) ((h) & 255) 111 | #define _1(h) (((h) >> 8) & 255) 112 | #define _2(h) (((h) >> 16) & 255) 113 | #define _3(h) ((h) >> 24) 114 | 115 | // fill histogram 116 | for (const T* i = e0; i != e0_end; ++i) 117 | { 118 | unsigned int h = pred(*i); 119 | 120 | h0[_0(h)]++; h1[_1(h)]++; h2[_2(h)]++; h3[_3(h)]++; 121 | } 122 | 123 | // compute offsets 124 | { 125 | unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; 126 | 127 | for (unsigned int i = 0; i < 256; ++i) 128 | { 129 | unsigned int c0 = h0[i]; 130 | unsigned int c1 = h1[i]; 131 | unsigned int c2 = h2[i]; 132 | unsigned int c3 = h3[i]; 133 | 134 | h0[i] = sum0; 135 | h1[i] = sum1; 136 | h2[i] = sum2; 137 | h3[i] = sum3; 138 | 139 | sum0 += c0; 140 | sum1 += c1; 141 | sum2 += c2; 142 | sum3 += c3; 143 | } 144 | } 145 | 146 | for (size_t i = 0; i < count; ++i) 147 | { 148 | unsigned int h = pred(e0[i]); 149 | e1[h0[_0(h)]++] = e0[i]; 150 | } 151 | 152 | for (size_t i = 0; i < count; ++i) 153 | { 154 | unsigned int h = pred(e1[i]); 155 | e0[h1[_1(h)]++] = e1[i]; 156 | } 157 | 158 | for (size_t i = 0; i < count; ++i) 159 | { 160 | unsigned int h = pred(e0[i]); 161 | e1[h2[_2(h)]++] = e0[i]; 162 | } 163 | 164 | for (size_t i = 0; i < count; ++i) 165 | { 166 | unsigned int h = pred(e1[i]); 167 | e0[h3[_3(h)]++] = e1[i]; 168 | } 169 | 170 | #undef _0 171 | #undef _1 172 | #undef _2 173 | #undef _3 174 | 175 | return e0; 176 | } -------------------------------------------------------------------------------- /src/base/SIMD.h: -------------------------------------------------------------------------------- 1 | 
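The three-pass radix sort above processes 11/11/10-bit digits and ping-pongs between two equally sized buffers, returning a pointer to whichever buffer ends up holding the final order. Below is a minimal standalone sketch of how it might be driven with float keys; the `Entry` struct, the sample values, and the include path are illustrative assumptions, not code from this repository.

```cpp
// Hypothetical usage sketch of radixSort3 + radixFloat (assumed include path).
#include <cstdio>
#include <vector>

#include "base/RadixSort.h"

struct Entry
{
    float key; // e.g. a sort key such as an AABB min-x coordinate
    int index; // payload carried along with the key
};

int main()
{
    std::vector<Entry> a = {{3.5f, 0}, {-1.0f, 1}, {0.25f, 2}, {-7.5f, 3}};
    std::vector<Entry> b(a.size()); // scratch buffer; the sort ping-pongs between a and b

    // radixFloat maps IEEE-754 bits to an order-preserving unsigned key for ordinary
    // finite floats, so negative values come out before positive ones.
    Entry* sorted = radixSort3(a.data(), b.data(), a.size(),
                               [](const Entry& e) { return radixFloat(e.key); });

    // radixSort3 returns the buffer that holds the result (for three passes, that is b).
    for (size_t i = 0; i < a.size(); ++i)
        printf("%.2f (index %d)\n", sorted[i].key, sorted[i].index);
}
```

The double-buffer design avoids any in-place shuffling: each pass streams elements from one buffer into the other at offsets precomputed from the histograms, which is why the caller must provide a scratch array of the same size and use the returned pointer rather than assuming the input array was sorted in place.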
#pragma once 2 | 3 | #include 4 | 5 | #ifdef _MSC_VER 6 | #define SIMD_INLINE __forceinline 7 | #define SIMD_ALIGN(n) __declspec(align(n)) 8 | #else 9 | #define SIMD_INLINE __attribute__((always_inline)) inline 10 | #define SIMD_ALIGN(n) __attribute__((aligned(n))) 11 | #endif 12 | 13 | namespace simd 14 | { 15 | template struct VNf_; 16 | template struct VNi_; 17 | template struct VNb_; 18 | 19 | template using VNf = typename VNf_::type; 20 | template using VNi = typename VNi_::type; 21 | template using VNb = typename VNb_::type; 22 | 23 | template void dump(const char* name, const T& v) 24 | { 25 | printf("%s:", name); 26 | 27 | const float* fptr = reinterpret_cast(&v); 28 | const int* iptr = reinterpret_cast(&v); 29 | 30 | for (size_t i = 0; i < sizeof(v) / 4; ++i) 31 | printf(" %f [%08x]", fptr[i], iptr[i]); 32 | 33 | printf("\n"); 34 | } 35 | } 36 | 37 | #define SIMD_DUMP(v) simd::dump(#v, v) 38 | 39 | #include "SIMD_Scalar.h" 40 | 41 | #ifdef __SSE2__ 42 | #include "SIMD_SSE2.h" 43 | #endif 44 | 45 | #ifdef __AVX2__ 46 | #include "SIMD_AVX2.h" 47 | #endif -------------------------------------------------------------------------------- /src/base/SIMD_AVX2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "SIMD_AVX2_Transpose.h" 4 | 5 | namespace simd 6 | { 7 | struct V8f 8 | { 9 | __m256 v; 10 | 11 | SIMD_INLINE V8f() 12 | { 13 | } 14 | 15 | SIMD_INLINE V8f(__m256 v): v(v) 16 | { 17 | } 18 | 19 | SIMD_INLINE operator __m256() const 20 | { 21 | return v; 22 | } 23 | 24 | SIMD_INLINE static V8f zero() 25 | { 26 | return _mm256_setzero_ps(); 27 | } 28 | 29 | SIMD_INLINE static V8f one(float v) 30 | { 31 | return _mm256_set1_ps(v); 32 | } 33 | 34 | SIMD_INLINE static V8f sign() 35 | { 36 | return _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); 37 | } 38 | 39 | SIMD_INLINE static V8f load(const float* ptr) 40 | { 41 | return _mm256_load_ps(ptr); 42 | } 43 | }; 44 | 45 | struct V8i 46 | { 47 | __m256i v; 48 | 49 | SIMD_INLINE V8i() 50 | { 51 | } 52 | 53 | SIMD_INLINE V8i(__m256i v): v(v) 54 | { 55 | } 56 | 57 | SIMD_INLINE operator __m256i() const 58 | { 59 | return v; 60 | } 61 | 62 | SIMD_INLINE static V8i zero() 63 | { 64 | return _mm256_setzero_si256(); 65 | } 66 | 67 | SIMD_INLINE static V8i one(int v) 68 | { 69 | return _mm256_set1_epi32(v); 70 | } 71 | 72 | SIMD_INLINE static V8i load(const int* ptr) 73 | { 74 | return _mm256_load_si256(reinterpret_cast(ptr)); 75 | } 76 | }; 77 | 78 | struct V8b 79 | { 80 | __m256 v; 81 | 82 | SIMD_INLINE V8b() 83 | { 84 | } 85 | 86 | SIMD_INLINE V8b(__m256 v): v(v) 87 | { 88 | } 89 | 90 | SIMD_INLINE V8b(__m256i v): v(_mm256_castsi256_ps(v)) 91 | { 92 | } 93 | 94 | SIMD_INLINE operator __m256() const 95 | { 96 | return v; 97 | } 98 | 99 | SIMD_INLINE static V8b zero() 100 | { 101 | return _mm256_setzero_ps(); 102 | } 103 | }; 104 | 105 | SIMD_INLINE V8i bitcast(V8f v) 106 | { 107 | return _mm256_castps_si256(v.v); 108 | } 109 | 110 | SIMD_INLINE V8f bitcast(V8i v) 111 | { 112 | return _mm256_castsi256_ps(v.v); 113 | } 114 | 115 | SIMD_INLINE V8f operator+(V8f v) 116 | { 117 | return v; 118 | } 119 | 120 | SIMD_INLINE V8f operator-(V8f v) 121 | { 122 | return _mm256_xor_ps(V8f::sign(), v.v); 123 | } 124 | 125 | SIMD_INLINE V8f operator+(V8f l, V8f r) 126 | { 127 | return _mm256_add_ps(l.v, r.v); 128 | } 129 | 130 | SIMD_INLINE V8f operator-(V8f l, V8f r) 131 | { 132 | return _mm256_sub_ps(l.v, r.v); 133 | } 134 | 135 | SIMD_INLINE V8f operator*(V8f l, V8f r) 136 | { 137 | return 
_mm256_mul_ps(l.v, r.v); 138 | } 139 | 140 | SIMD_INLINE V8f operator/(V8f l, V8f r) 141 | { 142 | return _mm256_div_ps(l.v, r.v); 143 | } 144 | 145 | SIMD_INLINE void operator+=(V8f& l, V8f r) 146 | { 147 | l.v = _mm256_add_ps(l.v, r.v); 148 | } 149 | 150 | SIMD_INLINE void operator-=(V8f& l, V8f r) 151 | { 152 | l.v = _mm256_sub_ps(l.v, r.v); 153 | } 154 | 155 | SIMD_INLINE void operator*=(V8f& l, V8f r) 156 | { 157 | l.v = _mm256_mul_ps(l.v, r.v); 158 | } 159 | 160 | SIMD_INLINE void operator/=(V8f& l, V8f r) 161 | { 162 | l.v = _mm256_div_ps(l.v, r.v); 163 | } 164 | 165 | SIMD_INLINE V8b operator==(V8f l, V8f r) 166 | { 167 | return _mm256_cmp_ps(l.v, r.v, _CMP_EQ_UQ); 168 | } 169 | 170 | SIMD_INLINE V8b operator==(V8i l, V8i r) 171 | { 172 | return _mm256_cmpeq_epi32(l.v, r.v); 173 | } 174 | 175 | SIMD_INLINE V8b operator!=(V8f l, V8f r) 176 | { 177 | return _mm256_cmp_ps(l.v, r.v, _CMP_NEQ_UQ); 178 | } 179 | 180 | SIMD_INLINE V8b operator!=(V8i l, V8i r) 181 | { 182 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpeq_epi32(l.v, r.v)); 183 | } 184 | 185 | SIMD_INLINE V8b operator<(V8f l, V8f r) 186 | { 187 | return _mm256_cmp_ps(l.v, r.v, _CMP_LT_OQ); 188 | } 189 | 190 | SIMD_INLINE V8b operator<(V8i l, V8i r) 191 | { 192 | return _mm256_cmpgt_epi32(r.v, l.v); 193 | } 194 | 195 | SIMD_INLINE V8b operator<=(V8f l, V8f r) 196 | { 197 | return _mm256_cmp_ps(l.v, r.v, _CMP_LE_OQ); 198 | } 199 | 200 | SIMD_INLINE V8b operator<=(V8i l, V8i r) 201 | { 202 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpgt_epi32(l.v, r.v)); 203 | } 204 | 205 | SIMD_INLINE V8b operator>(V8f l, V8f r) 206 | { 207 | return _mm256_cmp_ps(l.v, r.v, _CMP_GT_OQ); 208 | } 209 | 210 | SIMD_INLINE V8b operator>(V8i l, V8i r) 211 | { 212 | return _mm256_cmpgt_epi32(l.v, r.v); 213 | } 214 | 215 | SIMD_INLINE V8b operator>=(V8f l, V8f r) 216 | { 217 | return _mm256_cmp_ps(l.v, r.v, _CMP_GE_OQ); 218 | } 219 | 220 | SIMD_INLINE V8b operator>=(V8i l, V8i r) 221 | { 222 | return _mm256_xor_si256(_mm256_setzero_si256(), _mm256_cmpgt_epi32(r.v, l.v)); 223 | } 224 | 225 | SIMD_INLINE V8b operator!(V8b v) 226 | { 227 | return _mm256_xor_ps(_mm256_setzero_ps(), v.v); 228 | } 229 | 230 | SIMD_INLINE V8b operator&(V8b l, V8b r) 231 | { 232 | return _mm256_and_ps(l.v, r.v); 233 | } 234 | 235 | SIMD_INLINE V8b operator|(V8b l, V8b r) 236 | { 237 | return _mm256_or_ps(l.v, r.v); 238 | } 239 | 240 | SIMD_INLINE V8b operator^(V8b l, V8b r) 241 | { 242 | return _mm256_xor_ps(l.v, r.v); 243 | } 244 | 245 | SIMD_INLINE void operator&=(V8b& l, V8b r) 246 | { 247 | l.v = _mm256_and_ps(l.v, r.v); 248 | } 249 | 250 | SIMD_INLINE void operator|=(V8b& l, V8b r) 251 | { 252 | l.v = _mm256_or_ps(l.v, r.v); 253 | } 254 | 255 | SIMD_INLINE void operator^=(V8b& l, V8b r) 256 | { 257 | l.v = _mm256_xor_ps(l.v, r.v); 258 | } 259 | 260 | SIMD_INLINE V8f abs(V8f v) 261 | { 262 | return _mm256_andnot_ps(V8f::sign(), v.v); 263 | } 264 | 265 | SIMD_INLINE V8f copysign(V8f x, V8f y) 266 | { 267 | V8f sign = V8f::sign(); 268 | 269 | return _mm256_or_ps(_mm256_andnot_ps(sign.v, x.v), _mm256_and_ps(y.v, sign.v)); 270 | } 271 | 272 | SIMD_INLINE V8f flipsign(V8f x, V8f y) 273 | { 274 | return _mm256_xor_ps(x.v, _mm256_and_ps(y.v, V8f::sign())); 275 | } 276 | 277 | SIMD_INLINE V8f min(V8f l, V8f r) 278 | { 279 | return _mm256_min_ps(l.v, r.v); 280 | } 281 | 282 | SIMD_INLINE V8f max(V8f l, V8f r) 283 | { 284 | return _mm256_max_ps(l.v, r.v); 285 | } 286 | 287 | SIMD_INLINE V8f select(V8f l, V8f r, V8b m) 288 | { 289 | return 
_mm256_blendv_ps(l.v, r.v, m.v); 290 | } 291 | 292 | SIMD_INLINE V8i select(V8i l, V8i r, V8b m) 293 | { 294 | __m256i mi = _mm256_castps_si256(m.v); 295 | 296 | return _mm256_blendv_epi8(l.v, r.v, mi); 297 | } 298 | 299 | SIMD_INLINE bool none(V8b v) 300 | { 301 | return _mm256_movemask_ps(v.v) == 0; 302 | } 303 | 304 | SIMD_INLINE bool any(V8b v) 305 | { 306 | return _mm256_movemask_ps(v.v) != 0; 307 | } 308 | 309 | SIMD_INLINE bool all(V8b v) 310 | { 311 | return _mm256_movemask_ps(v.v) == 31; 312 | } 313 | 314 | SIMD_INLINE void store(V8f v, float* ptr) 315 | { 316 | _mm256_store_ps(ptr, v.v); 317 | } 318 | 319 | SIMD_INLINE void store(V8i v, int* ptr) 320 | { 321 | _mm256_store_si256(reinterpret_cast<__m256i*>(ptr), v.v); 322 | } 323 | 324 | SIMD_INLINE void loadindexed4(V8f& v0, V8f& v1, V8f& v2, V8f& v3, const void* base, const int indices[8], unsigned int stride) 325 | { 326 | const char* ptr = static_cast(base); 327 | 328 | __m128 hr0 = _mm_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 329 | __m128 hr1 = _mm_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 330 | __m128 hr2 = _mm_load_ps(reinterpret_cast(ptr + indices[2] * stride)); 331 | __m128 hr3 = _mm_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 332 | __m128 hr4 = _mm_load_ps(reinterpret_cast(ptr + indices[4] * stride)); 333 | __m128 hr5 = _mm_load_ps(reinterpret_cast(ptr + indices[5] * stride)); 334 | __m128 hr6 = _mm_load_ps(reinterpret_cast(ptr + indices[6] * stride)); 335 | __m128 hr7 = _mm_load_ps(reinterpret_cast(ptr + indices[7] * stride)); 336 | 337 | __m256 r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr0), hr4, 1); 338 | __m256 r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr1), hr5, 1); 339 | __m256 r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr2), hr6, 1); 340 | __m256 r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(hr3), hr7, 1); 341 | 342 | _MM_TRANSPOSE8_LANE4_PS(r0, r1, r2, r3); 343 | 344 | v0.v = r0; 345 | v1.v = r1; 346 | v2.v = r2; 347 | v3.v = r3; 348 | } 349 | 350 | SIMD_INLINE void storeindexed4(const V8f& v0, const V8f& v1, const V8f& v2, const V8f& v3, void* base, const int indices[8], unsigned int stride) 351 | { 352 | char* ptr = static_cast(base); 353 | 354 | __m256 r0 = v0.v; 355 | __m256 r1 = v1.v; 356 | __m256 r2 = v2.v; 357 | __m256 r3 = v3.v; 358 | 359 | _MM_TRANSPOSE8_LANE4_PS(r0, r1, r2, r3); 360 | 361 | __m128 hr0 = _mm256_castps256_ps128(r0); 362 | __m128 hr1 = _mm256_castps256_ps128(r1); 363 | __m128 hr2 = _mm256_castps256_ps128(r2); 364 | __m128 hr3 = _mm256_castps256_ps128(r3); 365 | __m128 hr4 = _mm256_extractf128_ps(r0, 1); 366 | __m128 hr5 = _mm256_extractf128_ps(r1, 1); 367 | __m128 hr6 = _mm256_extractf128_ps(r2, 1); 368 | __m128 hr7 = _mm256_extractf128_ps(r3, 1); 369 | 370 | _mm_store_ps(reinterpret_cast(ptr + indices[0] * stride), hr0); 371 | _mm_store_ps(reinterpret_cast(ptr + indices[1] * stride), hr1); 372 | _mm_store_ps(reinterpret_cast(ptr + indices[2] * stride), hr2); 373 | _mm_store_ps(reinterpret_cast(ptr + indices[3] * stride), hr3); 374 | _mm_store_ps(reinterpret_cast(ptr + indices[4] * stride), hr4); 375 | _mm_store_ps(reinterpret_cast(ptr + indices[5] * stride), hr5); 376 | _mm_store_ps(reinterpret_cast(ptr + indices[6] * stride), hr6); 377 | _mm_store_ps(reinterpret_cast(ptr + indices[7] * stride), hr7); 378 | } 379 | 380 | SIMD_INLINE void loadindexed8(V8f& v0, V8f& v1, V8f& v2, V8f& v3, V8f& v4, V8f& v5, V8f& v6, V8f& v7, const void* base, const int indices[8], unsigned int stride) 381 | { 382 | const char* ptr = static_cast(base); 
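// Gather one 32-byte row (8 contiguous floats) per index from the strided AoS layout
// using aligned loads, then transpose the resulting 8x8 block so that each output
// register holds the same field for all 8 indexed elements (SoA form), ready for SIMD math.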
383 | 384 | __m256 r0 = _mm256_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 385 | __m256 r1 = _mm256_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 386 | __m256 r2 = _mm256_load_ps(reinterpret_cast(ptr + indices[2] * stride)); 387 | __m256 r3 = _mm256_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 388 | __m256 r4 = _mm256_load_ps(reinterpret_cast(ptr + indices[4] * stride)); 389 | __m256 r5 = _mm256_load_ps(reinterpret_cast(ptr + indices[5] * stride)); 390 | __m256 r6 = _mm256_load_ps(reinterpret_cast(ptr + indices[6] * stride)); 391 | __m256 r7 = _mm256_load_ps(reinterpret_cast(ptr + indices[7] * stride)); 392 | 393 | _MM_TRANSPOSE8_PS(r0, r1, r2, r3, r4, r5, r6, r7); 394 | 395 | v0.v = r0; 396 | v1.v = r1; 397 | v2.v = r2; 398 | v3.v = r3; 399 | v4.v = r4; 400 | v5.v = r5; 401 | v6.v = r6; 402 | v7.v = r7; 403 | } 404 | } 405 | 406 | namespace simd 407 | { 408 | template <> struct VNf_<8> { typedef V8f type; }; 409 | template <> struct VNi_<8> { typedef V8i type; }; 410 | template <> struct VNb_<8> { typedef V8b type; }; 411 | } 412 | 413 | using simd::V8f; 414 | using simd::V8i; 415 | using simd::V8b; -------------------------------------------------------------------------------- /src/base/SIMD_AVX2_Transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Transpose 4x4 blocks within each lane 4 | #define _MM_TRANSPOSE8_LANE4_PS(row0, row1, row2, row3) \ 5 | do { \ 6 | __m256 __t0, __t1, __t2, __t3; \ 7 | __t0 = _mm256_unpacklo_ps(row0, row1); \ 8 | __t1 = _mm256_unpackhi_ps(row0, row1); \ 9 | __t2 = _mm256_unpacklo_ps(row2, row3); \ 10 | __t3 = _mm256_unpackhi_ps(row2, row3); \ 11 | row0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ 12 | row1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ 13 | row2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ 14 | row3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ 15 | } while (0) 16 | 17 | // http://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 18 | #define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \ 19 | do { \ 20 | __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; \ 21 | __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; \ 22 | __t0 = _mm256_unpacklo_ps(row0, row1); \ 23 | __t1 = _mm256_unpackhi_ps(row0, row1); \ 24 | __t2 = _mm256_unpacklo_ps(row2, row3); \ 25 | __t3 = _mm256_unpackhi_ps(row2, row3); \ 26 | __t4 = _mm256_unpacklo_ps(row4, row5); \ 27 | __t5 = _mm256_unpackhi_ps(row4, row5); \ 28 | __t6 = _mm256_unpacklo_ps(row6, row7); \ 29 | __t7 = _mm256_unpackhi_ps(row6, row7); \ 30 | __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); \ 31 | __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); \ 32 | __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); \ 33 | __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); \ 34 | __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); \ 35 | __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); \ 36 | __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); \ 37 | __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); \ 38 | row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); \ 39 | row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); \ 40 | row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); \ 41 | row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); \ 42 | row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); \ 43 | row5 = 
_mm256_permute2f128_ps(__tt1, __tt5, 0x31); \ 44 | row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); \ 45 | row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); \ 46 | } while (0) 47 | -------------------------------------------------------------------------------- /src/base/SIMD_SSE2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace simd 4 | { 5 | struct V4f 6 | { 7 | __m128 v; 8 | 9 | SIMD_INLINE V4f() 10 | { 11 | } 12 | 13 | SIMD_INLINE V4f(__m128 v): v(v) 14 | { 15 | } 16 | 17 | SIMD_INLINE operator __m128() const 18 | { 19 | return v; 20 | } 21 | 22 | SIMD_INLINE static V4f zero() 23 | { 24 | return _mm_setzero_ps(); 25 | } 26 | 27 | SIMD_INLINE static V4f one(float v) 28 | { 29 | return _mm_set1_ps(v); 30 | } 31 | 32 | SIMD_INLINE static V4f sign() 33 | { 34 | return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); 35 | } 36 | 37 | SIMD_INLINE static V4f load(const float* ptr) 38 | { 39 | return _mm_load_ps(ptr); 40 | } 41 | }; 42 | 43 | struct V4i 44 | { 45 | __m128i v; 46 | 47 | SIMD_INLINE V4i() 48 | { 49 | } 50 | 51 | SIMD_INLINE V4i(__m128i v): v(v) 52 | { 53 | } 54 | 55 | SIMD_INLINE operator __m128i() const 56 | { 57 | return v; 58 | } 59 | 60 | SIMD_INLINE static V4i zero() 61 | { 62 | return _mm_setzero_si128(); 63 | } 64 | 65 | SIMD_INLINE static V4i one(int v) 66 | { 67 | return _mm_set1_epi32(v); 68 | } 69 | 70 | SIMD_INLINE static V4i load(const int* ptr) 71 | { 72 | return _mm_load_si128(reinterpret_cast(ptr)); 73 | } 74 | }; 75 | 76 | struct V4b 77 | { 78 | __m128 v; 79 | 80 | SIMD_INLINE V4b() 81 | { 82 | } 83 | 84 | SIMD_INLINE V4b(__m128 v): v(v) 85 | { 86 | } 87 | 88 | SIMD_INLINE V4b(__m128i v): v(_mm_castsi128_ps(v)) 89 | { 90 | } 91 | 92 | SIMD_INLINE operator __m128() const 93 | { 94 | return v; 95 | } 96 | 97 | SIMD_INLINE static V4b zero() 98 | { 99 | return _mm_setzero_ps(); 100 | } 101 | }; 102 | 103 | SIMD_INLINE V4f bitcast(V4i v) 104 | { 105 | return _mm_castsi128_ps(v.v); 106 | } 107 | 108 | SIMD_INLINE V4i bitcast(V4f v) 109 | { 110 | return _mm_castps_si128(v.v); 111 | } 112 | 113 | SIMD_INLINE V4f operator+(V4f v) 114 | { 115 | return v; 116 | } 117 | 118 | SIMD_INLINE V4f operator-(V4f v) 119 | { 120 | return _mm_xor_ps(V4f::sign(), v.v); 121 | } 122 | 123 | SIMD_INLINE V4f operator+(V4f l, V4f r) 124 | { 125 | return _mm_add_ps(l.v, r.v); 126 | } 127 | 128 | SIMD_INLINE V4f operator-(V4f l, V4f r) 129 | { 130 | return _mm_sub_ps(l.v, r.v); 131 | } 132 | 133 | SIMD_INLINE V4f operator*(V4f l, V4f r) 134 | { 135 | return _mm_mul_ps(l.v, r.v); 136 | } 137 | 138 | SIMD_INLINE V4f operator/(V4f l, V4f r) 139 | { 140 | return _mm_div_ps(l.v, r.v); 141 | } 142 | 143 | SIMD_INLINE void operator+=(V4f& l, V4f r) 144 | { 145 | l.v = _mm_add_ps(l.v, r.v); 146 | } 147 | 148 | SIMD_INLINE void operator-=(V4f& l, V4f r) 149 | { 150 | l.v = _mm_sub_ps(l.v, r.v); 151 | } 152 | 153 | SIMD_INLINE void operator*=(V4f& l, V4f r) 154 | { 155 | l.v = _mm_mul_ps(l.v, r.v); 156 | } 157 | 158 | SIMD_INLINE void operator/=(V4f& l, V4f r) 159 | { 160 | l.v = _mm_div_ps(l.v, r.v); 161 | } 162 | 163 | SIMD_INLINE V4b operator==(V4f l, V4f r) 164 | { 165 | return _mm_cmpeq_ps(l.v, r.v); 166 | } 167 | 168 | SIMD_INLINE V4b operator==(V4i l, V4i r) 169 | { 170 | return _mm_cmpeq_epi32(l.v, r.v); 171 | } 172 | 173 | SIMD_INLINE V4b operator!=(V4f l, V4f r) 174 | { 175 | return _mm_cmpneq_ps(l.v, r.v); 176 | } 177 | 178 | SIMD_INLINE V4b operator!=(V4i l, V4i r) 179 | { 180 | return _mm_xor_si128(_mm_setzero_si128(), 
_mm_cmpeq_epi32(l.v, r.v)); 181 | } 182 | 183 | SIMD_INLINE V4b operator<(V4f l, V4f r) 184 | { 185 | return _mm_cmplt_ps(l.v, r.v); 186 | } 187 | 188 | SIMD_INLINE V4b operator<(V4i l, V4i r) 189 | { 190 | return _mm_cmplt_epi32(l.v, r.v); 191 | } 192 | 193 | SIMD_INLINE V4b operator<=(V4f l, V4f r) 194 | { 195 | return _mm_cmple_ps(l.v, r.v); 196 | } 197 | 198 | SIMD_INLINE V4b operator<=(V4i l, V4i r) 199 | { 200 | return _mm_xor_si128(_mm_setzero_si128(), _mm_cmplt_epi32(r.v, l.v)); 201 | } 202 | 203 | SIMD_INLINE V4b operator>(V4f l, V4f r) 204 | { 205 | return _mm_cmpgt_ps(l.v, r.v); 206 | } 207 | 208 | SIMD_INLINE V4b operator>(V4i l, V4i r) 209 | { 210 | return _mm_cmpgt_epi32(l.v, r.v); 211 | } 212 | 213 | SIMD_INLINE V4b operator>=(V4f l, V4f r) 214 | { 215 | return _mm_cmpge_ps(l.v, r.v); 216 | } 217 | 218 | SIMD_INLINE V4b operator>=(V4i l, V4i r) 219 | { 220 | return _mm_xor_si128(_mm_setzero_si128(), _mm_cmplt_epi32(l.v, r.v)); 221 | } 222 | 223 | SIMD_INLINE V4b operator!(V4b v) 224 | { 225 | return _mm_xor_ps(_mm_setzero_ps(), v.v); 226 | } 227 | 228 | SIMD_INLINE V4b operator&(V4b l, V4b r) 229 | { 230 | return _mm_and_ps(l.v, r.v); 231 | } 232 | 233 | SIMD_INLINE V4b operator|(V4b l, V4b r) 234 | { 235 | return _mm_or_ps(l.v, r.v); 236 | } 237 | 238 | SIMD_INLINE V4b operator^(V4b l, V4b r) 239 | { 240 | return _mm_xor_ps(l.v, r.v); 241 | } 242 | 243 | SIMD_INLINE void operator&=(V4b& l, V4b r) 244 | { 245 | l.v = _mm_and_ps(l.v, r.v); 246 | } 247 | 248 | SIMD_INLINE void operator|=(V4b& l, V4b r) 249 | { 250 | l.v = _mm_or_ps(l.v, r.v); 251 | } 252 | 253 | SIMD_INLINE void operator^=(V4b l, V4b r) 254 | { 255 | l.v = _mm_xor_ps(l.v, r.v); 256 | } 257 | 258 | SIMD_INLINE V4f abs(V4f v) 259 | { 260 | return _mm_andnot_ps(V4f::sign(), v.v); 261 | } 262 | 263 | SIMD_INLINE V4f copysign(V4f x, V4f y) 264 | { 265 | V4f sign = V4f::sign(); 266 | 267 | return _mm_or_ps(_mm_andnot_ps(sign.v, x.v), _mm_and_ps(y.v, sign.v)); 268 | } 269 | 270 | SIMD_INLINE V4f flipsign(V4f x, V4f y) 271 | { 272 | return _mm_xor_ps(x.v, _mm_and_ps(y.v, V4f::sign())); 273 | } 274 | 275 | SIMD_INLINE V4f min(V4f l, V4f r) 276 | { 277 | return _mm_min_ps(l.v, r.v); 278 | } 279 | 280 | SIMD_INLINE V4f max(V4f l, V4f r) 281 | { 282 | return _mm_max_ps(l.v, r.v); 283 | } 284 | 285 | SIMD_INLINE V4f select(V4f l, V4f r, V4b m) 286 | { 287 | return _mm_or_ps(_mm_andnot_ps(m.v, l.v), _mm_and_ps(r.v, m.v)); 288 | } 289 | 290 | SIMD_INLINE V4i select(V4i l, V4i r, V4b m) 291 | { 292 | __m128i mi = _mm_castps_si128(m.v); 293 | 294 | return _mm_or_si128(_mm_andnot_si128(mi, l.v), _mm_and_si128(r.v, mi)); 295 | } 296 | 297 | SIMD_INLINE bool none(V4b v) 298 | { 299 | return _mm_movemask_ps(v.v) == 0; 300 | } 301 | 302 | SIMD_INLINE bool any(V4b v) 303 | { 304 | return _mm_movemask_ps(v.v) != 0; 305 | } 306 | 307 | SIMD_INLINE bool all(V4b v) 308 | { 309 | return _mm_movemask_ps(v.v) == 15; 310 | } 311 | 312 | SIMD_INLINE void store(V4f v, float* ptr) 313 | { 314 | _mm_store_ps(ptr, v.v); 315 | } 316 | 317 | SIMD_INLINE void store(V4i v, int* ptr) 318 | { 319 | _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v.v); 320 | } 321 | 322 | SIMD_INLINE void loadindexed4(V4f& v0, V4f& v1, V4f& v2, V4f& v3, const void* base, const int indices[4], unsigned int stride) 323 | { 324 | const char* ptr = static_cast(base); 325 | 326 | __m128 r0 = _mm_load_ps(reinterpret_cast(ptr + indices[0] * stride)); 327 | __m128 r1 = _mm_load_ps(reinterpret_cast(ptr + indices[1] * stride)); 328 | __m128 r2 = _mm_load_ps(reinterpret_cast(ptr 
+ indices[2] * stride)); 329 | __m128 r3 = _mm_load_ps(reinterpret_cast(ptr + indices[3] * stride)); 330 | 331 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 332 | 333 | v0.v = r0; 334 | v1.v = r1; 335 | v2.v = r2; 336 | v3.v = r3; 337 | } 338 | 339 | SIMD_INLINE void storeindexed4(const V4f& v0, const V4f& v1, const V4f& v2, const V4f& v3, void* base, const int indices[4], unsigned int stride) 340 | { 341 | char* ptr = static_cast(base); 342 | 343 | __m128 r0 = v0.v; 344 | __m128 r1 = v1.v; 345 | __m128 r2 = v2.v; 346 | __m128 r3 = v3.v; 347 | 348 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 349 | 350 | _mm_store_ps(reinterpret_cast(ptr + indices[0] * stride), r0); 351 | _mm_store_ps(reinterpret_cast(ptr + indices[1] * stride), r1); 352 | _mm_store_ps(reinterpret_cast(ptr + indices[2] * stride), r2); 353 | _mm_store_ps(reinterpret_cast(ptr + indices[3] * stride), r3); 354 | } 355 | 356 | SIMD_INLINE void loadindexed8(V4f& v0, V4f& v1, V4f& v2, V4f& v3, V4f& v4, V4f& v5, V4f& v6, V4f& v7, const void* base, const int indices[4], unsigned int stride) 357 | { 358 | const char* ptr = static_cast(base); 359 | 360 | const float* ptr0 = reinterpret_cast(ptr + indices[0] * stride); 361 | const float* ptr1 = reinterpret_cast(ptr + indices[1] * stride); 362 | const float* ptr2 = reinterpret_cast(ptr + indices[2] * stride); 363 | const float* ptr3 = reinterpret_cast(ptr + indices[3] * stride); 364 | 365 | __m128 r0 = _mm_load_ps(ptr0 + 0); 366 | __m128 r1 = _mm_load_ps(ptr1 + 0); 367 | __m128 r2 = _mm_load_ps(ptr2 + 0); 368 | __m128 r3 = _mm_load_ps(ptr3 + 0); 369 | __m128 r4 = _mm_load_ps(ptr0 + 4); 370 | __m128 r5 = _mm_load_ps(ptr1 + 4); 371 | __m128 r6 = _mm_load_ps(ptr2 + 4); 372 | __m128 r7 = _mm_load_ps(ptr3 + 4); 373 | 374 | _MM_TRANSPOSE4_PS(r0, r1, r2, r3); 375 | _MM_TRANSPOSE4_PS(r4, r5, r6, r7); 376 | 377 | v0.v = r0; 378 | v1.v = r1; 379 | v2.v = r2; 380 | v3.v = r3; 381 | v4.v = r4; 382 | v5.v = r5; 383 | v6.v = r6; 384 | v7.v = r7; 385 | } 386 | } 387 | 388 | namespace simd 389 | { 390 | template <> struct VNf_<4> { typedef V4f type; }; 391 | template <> struct VNi_<4> { typedef V4i type; }; 392 | template <> struct VNb_<4> { typedef V4b type; }; 393 | } 394 | 395 | using simd::V4f; 396 | using simd::V4i; 397 | using simd::V4b; 398 | -------------------------------------------------------------------------------- /src/base/SIMD_Scalar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace simd 6 | { 7 | struct V1f 8 | { 9 | float v; 10 | 11 | SIMD_INLINE V1f() 12 | { 13 | } 14 | 15 | SIMD_INLINE V1f(float v): v(v) 16 | { 17 | } 18 | 19 | SIMD_INLINE operator float() const 20 | { 21 | return v; 22 | } 23 | 24 | SIMD_INLINE static V1f zero() 25 | { 26 | return 0.f; 27 | } 28 | 29 | SIMD_INLINE static V1f one(float v) 30 | { 31 | return v; 32 | } 33 | 34 | SIMD_INLINE static V1f load(const float* ptr) 35 | { 36 | return *ptr; 37 | } 38 | }; 39 | 40 | struct V1i 41 | { 42 | int v; 43 | 44 | SIMD_INLINE V1i() 45 | { 46 | } 47 | 48 | SIMD_INLINE V1i(int v): v(v) 49 | { 50 | } 51 | 52 | SIMD_INLINE operator int() const 53 | { 54 | return v; 55 | } 56 | 57 | SIMD_INLINE static V1i zero() 58 | { 59 | return 0; 60 | } 61 | 62 | SIMD_INLINE static V1i one(int v) 63 | { 64 | return v; 65 | } 66 | 67 | SIMD_INLINE static V1i load(const int* ptr) 68 | { 69 | return *ptr; 70 | } 71 | }; 72 | 73 | struct V1b 74 | { 75 | bool v; 76 | 77 | SIMD_INLINE V1b() 78 | { 79 | } 80 | 81 | SIMD_INLINE V1b(bool v): v(v) 82 | { 83 | } 84 | 85 | 
SIMD_INLINE operator bool() const 86 | { 87 | return v; 88 | } 89 | 90 | SIMD_INLINE static V1b zero() 91 | { 92 | return false; 93 | } 94 | }; 95 | 96 | SIMD_INLINE V1f bitcast(const V1i& v) 97 | { 98 | union { float f; int i; } u; 99 | u.i = v.v; 100 | return u.f; 101 | } 102 | 103 | SIMD_INLINE V1i bitcast(const V1f& v) 104 | { 105 | union { float f; int i; } u; 106 | u.f = v.v; 107 | return u.i; 108 | } 109 | 110 | SIMD_INLINE V1f operator+(V1f v) 111 | { 112 | return v; 113 | } 114 | 115 | SIMD_INLINE V1f operator-(V1f v) 116 | { 117 | return -v.v; 118 | } 119 | 120 | SIMD_INLINE V1f operator+(V1f l, V1f r) 121 | { 122 | return l.v + r.v; 123 | } 124 | 125 | SIMD_INLINE V1f operator-(V1f l, V1f r) 126 | { 127 | return l.v - r.v; 128 | } 129 | 130 | SIMD_INLINE V1f operator*(V1f l, V1f r) 131 | { 132 | return l.v * r.v; 133 | } 134 | 135 | SIMD_INLINE V1f operator/(V1f l, V1f r) 136 | { 137 | return l.v / r.v; 138 | } 139 | 140 | SIMD_INLINE void operator+=(V1f& l, V1f r) 141 | { 142 | l.v += r.v; 143 | } 144 | 145 | SIMD_INLINE void operator-=(V1f& l, V1f r) 146 | { 147 | l.v -= r.v; 148 | } 149 | 150 | SIMD_INLINE void operator*=(V1f& l, V1f r) 151 | { 152 | l.v *= r.v; 153 | } 154 | 155 | SIMD_INLINE void operator/=(V1f& l, V1f r) 156 | { 157 | l.v /= r.v; 158 | } 159 | 160 | SIMD_INLINE V1b operator==(V1f l, V1f r) 161 | { 162 | return l.v == r.v; 163 | } 164 | 165 | SIMD_INLINE V1b operator==(V1i l, V1i r) 166 | { 167 | return l.v == r.v; 168 | } 169 | 170 | SIMD_INLINE V1b operator!=(V1f l, V1f r) 171 | { 172 | return l.v != r.v; 173 | } 174 | 175 | SIMD_INLINE V1b operator!=(V1i l, V1i r) 176 | { 177 | return l.v != r.v; 178 | } 179 | 180 | SIMD_INLINE V1b operator<(V1f l, V1f r) 181 | { 182 | return l.v < r.v; 183 | } 184 | 185 | SIMD_INLINE V1b operator<(V1i l, V1i r) 186 | { 187 | return l.v < r.v; 188 | } 189 | 190 | SIMD_INLINE V1b operator<=(V1f l, V1f r) 191 | { 192 | return l.v <= r.v; 193 | } 194 | 195 | SIMD_INLINE V1b operator<=(V1i l, V1i r) 196 | { 197 | return l.v <= r.v; 198 | } 199 | 200 | SIMD_INLINE V1b operator>(V1f l, V1f r) 201 | { 202 | return l.v > r.v; 203 | } 204 | 205 | SIMD_INLINE V1b operator>(V1i l, V1i r) 206 | { 207 | return l.v > r.v; 208 | } 209 | 210 | SIMD_INLINE V1b operator>=(V1f l, V1f r) 211 | { 212 | return l.v >= r.v; 213 | } 214 | 215 | SIMD_INLINE V1b operator>=(V1i l, V1i r) 216 | { 217 | return l.v >= r.v; 218 | } 219 | 220 | SIMD_INLINE V1b operator!(V1b v) 221 | { 222 | return !v.v; 223 | } 224 | 225 | SIMD_INLINE V1b operator&(V1b l, V1b r) 226 | { 227 | return l.v & r.v; 228 | } 229 | 230 | SIMD_INLINE V1b operator|(V1b l, V1b r) 231 | { 232 | return l.v | r.v; 233 | } 234 | 235 | SIMD_INLINE V1b operator^(V1b l, V1b r) 236 | { 237 | return l.v ^ r.v; 238 | } 239 | 240 | SIMD_INLINE void operator&=(V1b& l, V1b r) 241 | { 242 | l.v &= r.v; 243 | } 244 | 245 | SIMD_INLINE void operator|=(V1b& l, V1b r) 246 | { 247 | l.v |= r.v; 248 | } 249 | 250 | SIMD_INLINE void operator^=(V1b l, V1b r) 251 | { 252 | l.v ^= r.v; 253 | } 254 | 255 | SIMD_INLINE V1f abs(V1f v) 256 | { 257 | return fabsf(v.v); 258 | } 259 | 260 | SIMD_INLINE V1f copysign(V1f x, V1f y) 261 | { 262 | return copysignf(x, y); 263 | } 264 | 265 | SIMD_INLINE V1f flipsign(V1f x, V1f y) 266 | { 267 | return y.v < 0.f ? -x.v : x.v; 268 | } 269 | 270 | SIMD_INLINE V1f min(V1f l, V1f r) 271 | { 272 | return l.v < r.v ? l.v : r.v; 273 | } 274 | 275 | SIMD_INLINE V1f max(V1f l, V1f r) 276 | { 277 | return l.v > r.v ? 
l.v : r.v; 278 | } 279 | 280 | SIMD_INLINE V1f select(V1f l, V1f r, V1b m) 281 | { 282 | return m.v ? r.v : l.v; 283 | } 284 | 285 | SIMD_INLINE V1i select(V1i l, V1i r, V1b m) 286 | { 287 | return m.v ? r.v : l.v; 288 | } 289 | 290 | SIMD_INLINE bool none(V1b v) 291 | { 292 | return !v.v; 293 | } 294 | 295 | SIMD_INLINE bool any(V1b v) 296 | { 297 | return v.v; 298 | } 299 | 300 | SIMD_INLINE bool all(V1b v) 301 | { 302 | return v.v; 303 | } 304 | 305 | SIMD_INLINE void store(V1f v, float* ptr) 306 | { 307 | *ptr = v.v; 308 | } 309 | 310 | SIMD_INLINE void store(V1i v, int* ptr) 311 | { 312 | *ptr = v.v; 313 | } 314 | 315 | SIMD_INLINE void loadindexed4(V1f& v0, V1f& v1, V1f& v2, V1f& v3, const void* base, const int indices[1], unsigned int stride) 316 | { 317 | const float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 318 | 319 | v0.v = ptr[0]; 320 | v1.v = ptr[1]; 321 | v2.v = ptr[2]; 322 | v3.v = ptr[3]; 323 | } 324 | 325 | SIMD_INLINE void storeindexed4(const V1f& v0, const V1f& v1, const V1f& v2, const V1f& v3, void* base, const int indices[1], unsigned int stride) 326 | { 327 | float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 328 | 329 | ptr[0] = v0.v; 330 | ptr[1] = v1.v; 331 | ptr[2] = v2.v; 332 | ptr[3] = v3.v; 333 | } 334 | 335 | SIMD_INLINE void loadindexed8(V1f& v0, V1f& v1, V1f& v2, V1f& v3, V1f& v4, V1f& v5, V1f& v6, V1f& v7, const void* base, const int indices[1], unsigned int stride) 336 | { 337 | const float* ptr = reinterpret_cast(static_cast(base) + indices[0] * stride); 338 | 339 | v0.v = ptr[0]; 340 | v1.v = ptr[1]; 341 | v2.v = ptr[2]; 342 | v3.v = ptr[3]; 343 | v4.v = ptr[4]; 344 | v5.v = ptr[5]; 345 | v6.v = ptr[6]; 346 | v7.v = ptr[7]; 347 | } 348 | } 349 | 350 | namespace simd 351 | { 352 | template <> struct VNf_<1> { typedef V1f type; }; 353 | template <> struct VNi_<1> { typedef V1i type; }; 354 | template <> struct VNb_<1> { typedef V1b type; }; 355 | } 356 | 357 | using simd::V1f; 358 | using simd::V1i; 359 | using simd::V1b; 360 | -------------------------------------------------------------------------------- /src/base/WorkQueue.cpp: -------------------------------------------------------------------------------- 1 | #include "WorkQueue.h" 2 | 3 | #include "microprofile.h" 4 | 5 | unsigned int WorkQueue::getIdealWorkerCount() 6 | { 7 | return std::max(std::thread::hardware_concurrency(), 1u); 8 | } 9 | 10 | WorkQueue::WorkQueue(unsigned int workerCount) 11 | { 12 | for (unsigned int i = 0; i < workerCount; ++i) 13 | workers.emplace_back(workerThreadFun, this, i); 14 | } 15 | 16 | WorkQueue::~WorkQueue() 17 | { 18 | pushItem(std::shared_ptr(), workers.size()); 19 | 20 | for (unsigned int i = 0; i < workers.size(); ++i) 21 | workers[i].join(); 22 | } 23 | 24 | void WorkQueue::pushItem(std::shared_ptr item, int count) 25 | { 26 | MICROPROFILE_SCOPEI("WorkQueue", "Push", -1); 27 | 28 | std::unique_lock lock(itemsMutex); 29 | 30 | items.push(std::make_pair(std::move(item), count)); 31 | 32 | lock.unlock(); 33 | 34 | MICROPROFILE_SCOPEI("WorkQueue", "Notify", -1); 35 | 36 | if (count > 1) 37 | itemsCondition.notify_all(); 38 | else 39 | itemsCondition.notify_one(); 40 | } 41 | 42 | void WorkQueue::workerThreadFun(WorkQueue* queue, int worker) 43 | { 44 | char name[16]; 45 | sprintf(name, "Worker %d", worker); 46 | MicroProfileOnThreadCreate(name); 47 | 48 | for (;;) 49 | { 50 | std::shared_ptr item; 51 | 52 | { 53 | std::unique_lock lock(queue->itemsMutex); 54 | 55 | queue->itemsCondition.wait(lock, [&]() { return 
!queue->items.empty(); }); 56 | 57 | auto& slot = queue->items.front(); 58 | 59 | if (slot.second <= 1) 60 | { 61 | item = std::move(slot.first); 62 | queue->items.pop(); 63 | } 64 | else 65 | { 66 | item = slot.first; 67 | slot.second--; 68 | } 69 | } 70 | 71 | if (!item) break; 72 | 73 | MICROPROFILE_SCOPEI("WorkQueue", "JobRun", 0x606060); 74 | 75 | item->run(worker); 76 | } 77 | 78 | MicroProfileOnThreadExit(); 79 | } -------------------------------------------------------------------------------- /src/base/WorkQueue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | class WorkQueue 11 | { 12 | public: 13 | struct Item 14 | { 15 | virtual ~Item() {} 16 | 17 | virtual void run(int worker) = 0; 18 | }; 19 | 20 | static unsigned int getIdealWorkerCount(); 21 | 22 | WorkQueue(unsigned int workerCount); 23 | ~WorkQueue(); 24 | 25 | void pushItem(std::shared_ptr item, int count = 1); 26 | 27 | template 28 | void pushFunction(F fun, int count = 1) 29 | { 30 | pushItem(std::make_shared(std::move(fun))); 31 | } 32 | 33 | unsigned int getWorkerCount() const 34 | { 35 | return workers.size(); 36 | } 37 | 38 | private: 39 | std::vector workers; 40 | 41 | std::mutex itemsMutex; 42 | std::condition_variable itemsCondition; 43 | std::queue, int>> items; 44 | 45 | static void workerThreadFun(WorkQueue* queue, int worker); 46 | 47 | template 48 | struct ItemFunction: WorkQueue::Item 49 | { 50 | T function; 51 | 52 | ItemFunction(T function): function(std::move(function)) 53 | { 54 | } 55 | 56 | void run(int worker) override 57 | { 58 | function(); 59 | } 60 | }; 61 | 62 | }; -------------------------------------------------------------------------------- /src/base/microprofile.cpp: -------------------------------------------------------------------------------- 1 | #define MICROPROFILE_HELP_ALT "Right-Click" 2 | #define MICROPROFILE_HELP_MOD "Ctrl" 3 | 4 | #ifdef _WIN32 5 | #include "../glad/glad.h" 6 | #endif 7 | 8 | #define GL_GLEXT_PROTOTYPES 9 | #include 10 | 11 | #ifdef __APPLE__ 12 | #define GL_DO_NOT_WARN_IF_MULTI_GL_VERSION_HEADERS_INCLUDED 13 | #include 14 | #endif 15 | 16 | #define MICROPROFILE_WEBSERVER 1 17 | #define MICROPROFILE_GPU_TIMERS_GL 1 18 | 19 | #define MICROPROFILE_IMPL 20 | #include "microprofile.h" 21 | 22 | #define MICROPROFILEUI_IMPL 23 | #include "microprofileui.h" 24 | 25 | #define MICROPROFILEDRAW_IMPL 26 | #include "microprofiledraw.h" 27 | 28 | #ifdef __linux__ 29 | #define GL_PROC(ret, name, args, argcall) \ 30 | ret name args { \ 31 | static ret (*ptr) args = reinterpret_cast(glfwGetProcAddress(#name)); \ 32 | return ptr argcall; \ 33 | } 34 | 35 | GL_PROC(void, glQueryCounter, (GLuint id, GLenum target), (id, target)) 36 | GL_PROC(void, glGetQueryObjectui64v, (GLuint id, GLenum pname, GLuint64 *params), (id, pname, params)) 37 | #endif 38 | -------------------------------------------------------------------------------- /src/glad/khrplatform.h: -------------------------------------------------------------------------------- 1 | #ifndef __khrplatform_h_ 2 | #define __khrplatform_h_ 3 | 4 | /* 5 | ** Copyright (c) 2008-2018 The Khronos Group Inc. 
6 | ** 7 | ** Permission is hereby granted, free of charge, to any person obtaining a 8 | ** copy of this software and/or associated documentation files (the 9 | ** "Materials"), to deal in the Materials without restriction, including 10 | ** without limitation the rights to use, copy, modify, merge, publish, 11 | ** distribute, sublicense, and/or sell copies of the Materials, and to 12 | ** permit persons to whom the Materials are furnished to do so, subject to 13 | ** the following conditions: 14 | ** 15 | ** The above copyright notice and this permission notice shall be included 16 | ** in all copies or substantial portions of the Materials. 17 | ** 18 | ** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | ** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 25 | */ 26 | 27 | /* Khronos platform-specific types and definitions. 28 | * 29 | * The master copy of khrplatform.h is maintained in the Khronos EGL 30 | * Registry repository at https://github.com/KhronosGroup/EGL-Registry 31 | * The last semantic modification to khrplatform.h was at commit ID: 32 | * 67a3e0864c2d75ea5287b9f3d2eb74a745936692 33 | * 34 | * Adopters may modify this file to suit their platform. Adopters are 35 | * encouraged to submit platform specific modifications to the Khronos 36 | * group so that they can be included in future versions of this file. 37 | * Please submit changes by filing pull requests or issues on 38 | * the EGL Registry repository linked above. 39 | * 40 | * 41 | * See the Implementer's Guidelines for information about where this file 42 | * should be located on your system and for more details of its use: 43 | * http://www.khronos.org/registry/implementers_guide.pdf 44 | * 45 | * This file should be included as 46 | * #include 47 | * by Khronos client API header files that use its types and defines. 48 | * 49 | * The types in khrplatform.h should only be used to define API-specific types. 50 | * 51 | * Types defined in khrplatform.h: 52 | * khronos_int8_t signed 8 bit 53 | * khronos_uint8_t unsigned 8 bit 54 | * khronos_int16_t signed 16 bit 55 | * khronos_uint16_t unsigned 16 bit 56 | * khronos_int32_t signed 32 bit 57 | * khronos_uint32_t unsigned 32 bit 58 | * khronos_int64_t signed 64 bit 59 | * khronos_uint64_t unsigned 64 bit 60 | * khronos_intptr_t signed same number of bits as a pointer 61 | * khronos_uintptr_t unsigned same number of bits as a pointer 62 | * khronos_ssize_t signed size 63 | * khronos_usize_t unsigned size 64 | * khronos_float_t signed 32 bit floating point 65 | * khronos_time_ns_t unsigned 64 bit time in nanoseconds 66 | * khronos_utime_nanoseconds_t unsigned time interval or absolute time in 67 | * nanoseconds 68 | * khronos_stime_nanoseconds_t signed time interval in nanoseconds 69 | * khronos_boolean_enum_t enumerated boolean type. This should 70 | * only be used as a base type when a client API's boolean type is 71 | * an enum. Client APIs which use an integer or other type for 72 | * booleans cannot use this as the base type for their boolean. 
73 | * 74 | * Tokens defined in khrplatform.h: 75 | * 76 | * KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values. 77 | * 78 | * KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0. 79 | * KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0. 80 | * 81 | * Calling convention macros defined in this file: 82 | * KHRONOS_APICALL 83 | * KHRONOS_APIENTRY 84 | * KHRONOS_APIATTRIBUTES 85 | * 86 | * These may be used in function prototypes as: 87 | * 88 | * KHRONOS_APICALL void KHRONOS_APIENTRY funcname( 89 | * int arg1, 90 | * int arg2) KHRONOS_APIATTRIBUTES; 91 | */ 92 | 93 | /*------------------------------------------------------------------------- 94 | * Definition of KHRONOS_APICALL 95 | *------------------------------------------------------------------------- 96 | * This precedes the return type of the function in the function prototype. 97 | */ 98 | #if defined(_WIN32) && !defined(__SCITECH_SNAP__) 99 | # define KHRONOS_APICALL __declspec(dllimport) 100 | #elif defined (__SYMBIAN32__) 101 | # define KHRONOS_APICALL IMPORT_C 102 | #elif defined(__ANDROID__) 103 | # define KHRONOS_APICALL __attribute__((visibility("default"))) 104 | #else 105 | # define KHRONOS_APICALL 106 | #endif 107 | 108 | /*------------------------------------------------------------------------- 109 | * Definition of KHRONOS_APIENTRY 110 | *------------------------------------------------------------------------- 111 | * This follows the return type of the function and precedes the function 112 | * name in the function prototype. 113 | */ 114 | #if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__) 115 | /* Win32 but not WinCE */ 116 | # define KHRONOS_APIENTRY __stdcall 117 | #else 118 | # define KHRONOS_APIENTRY 119 | #endif 120 | 121 | /*------------------------------------------------------------------------- 122 | * Definition of KHRONOS_APIATTRIBUTES 123 | *------------------------------------------------------------------------- 124 | * This follows the closing parenthesis of the function prototype arguments. 
125 | */ 126 | #if defined (__ARMCC_2__) 127 | #define KHRONOS_APIATTRIBUTES __softfp 128 | #else 129 | #define KHRONOS_APIATTRIBUTES 130 | #endif 131 | 132 | /*------------------------------------------------------------------------- 133 | * basic type definitions 134 | *-----------------------------------------------------------------------*/ 135 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__) 136 | 137 | 138 | /* 139 | * Using 140 | */ 141 | #include 142 | typedef int32_t khronos_int32_t; 143 | typedef uint32_t khronos_uint32_t; 144 | typedef int64_t khronos_int64_t; 145 | typedef uint64_t khronos_uint64_t; 146 | #define KHRONOS_SUPPORT_INT64 1 147 | #define KHRONOS_SUPPORT_FLOAT 1 148 | 149 | #elif defined(__VMS ) || defined(__sgi) 150 | 151 | /* 152 | * Using 153 | */ 154 | #include 155 | typedef int32_t khronos_int32_t; 156 | typedef uint32_t khronos_uint32_t; 157 | typedef int64_t khronos_int64_t; 158 | typedef uint64_t khronos_uint64_t; 159 | #define KHRONOS_SUPPORT_INT64 1 160 | #define KHRONOS_SUPPORT_FLOAT 1 161 | 162 | #elif defined(_WIN32) && !defined(__SCITECH_SNAP__) 163 | 164 | /* 165 | * Win32 166 | */ 167 | typedef __int32 khronos_int32_t; 168 | typedef unsigned __int32 khronos_uint32_t; 169 | typedef __int64 khronos_int64_t; 170 | typedef unsigned __int64 khronos_uint64_t; 171 | #define KHRONOS_SUPPORT_INT64 1 172 | #define KHRONOS_SUPPORT_FLOAT 1 173 | 174 | #elif defined(__sun__) || defined(__digital__) 175 | 176 | /* 177 | * Sun or Digital 178 | */ 179 | typedef int khronos_int32_t; 180 | typedef unsigned int khronos_uint32_t; 181 | #if defined(__arch64__) || defined(_LP64) 182 | typedef long int khronos_int64_t; 183 | typedef unsigned long int khronos_uint64_t; 184 | #else 185 | typedef long long int khronos_int64_t; 186 | typedef unsigned long long int khronos_uint64_t; 187 | #endif /* __arch64__ */ 188 | #define KHRONOS_SUPPORT_INT64 1 189 | #define KHRONOS_SUPPORT_FLOAT 1 190 | 191 | #elif 0 192 | 193 | /* 194 | * Hypothetical platform with no float or int64 support 195 | */ 196 | typedef int khronos_int32_t; 197 | typedef unsigned int khronos_uint32_t; 198 | #define KHRONOS_SUPPORT_INT64 0 199 | #define KHRONOS_SUPPORT_FLOAT 0 200 | 201 | #else 202 | 203 | /* 204 | * Generic fallback 205 | */ 206 | #include 207 | typedef int32_t khronos_int32_t; 208 | typedef uint32_t khronos_uint32_t; 209 | typedef int64_t khronos_int64_t; 210 | typedef uint64_t khronos_uint64_t; 211 | #define KHRONOS_SUPPORT_INT64 1 212 | #define KHRONOS_SUPPORT_FLOAT 1 213 | 214 | #endif 215 | 216 | 217 | /* 218 | * Types that are (so far) the same on all platforms 219 | */ 220 | typedef signed char khronos_int8_t; 221 | typedef unsigned char khronos_uint8_t; 222 | typedef signed short int khronos_int16_t; 223 | typedef unsigned short int khronos_uint16_t; 224 | 225 | /* 226 | * Types that differ between LLP64 and LP64 architectures - in LLP64, 227 | * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears 228 | * to be the only LLP64 architecture in current use. 
229 | */ 230 | #ifdef _WIN64 231 | typedef signed long long int khronos_intptr_t; 232 | typedef unsigned long long int khronos_uintptr_t; 233 | typedef signed long long int khronos_ssize_t; 234 | typedef unsigned long long int khronos_usize_t; 235 | #else 236 | typedef signed long int khronos_intptr_t; 237 | typedef unsigned long int khronos_uintptr_t; 238 | typedef signed long int khronos_ssize_t; 239 | typedef unsigned long int khronos_usize_t; 240 | #endif 241 | 242 | #if KHRONOS_SUPPORT_FLOAT 243 | /* 244 | * Float type 245 | */ 246 | typedef float khronos_float_t; 247 | #endif 248 | 249 | #if KHRONOS_SUPPORT_INT64 250 | /* Time types 251 | * 252 | * These types can be used to represent a time interval in nanoseconds or 253 | * an absolute Unadjusted System Time. Unadjusted System Time is the number 254 | * of nanoseconds since some arbitrary system event (e.g. since the last 255 | * time the system booted). The Unadjusted System Time is an unsigned 256 | * 64 bit value that wraps back to 0 every 584 years. Time intervals 257 | * may be either signed or unsigned. 258 | */ 259 | typedef khronos_uint64_t khronos_utime_nanoseconds_t; 260 | typedef khronos_int64_t khronos_stime_nanoseconds_t; 261 | #endif 262 | 263 | /* 264 | * Dummy value used to pad enum types to 32 bits. 265 | */ 266 | #ifndef KHRONOS_MAX_ENUM 267 | #define KHRONOS_MAX_ENUM 0x7FFFFFFF 268 | #endif 269 | 270 | /* 271 | * Enumerated boolean type 272 | * 273 | * Values other than zero should be considered to be true. Therefore 274 | * comparisons should not be made against KHRONOS_TRUE. 275 | */ 276 | typedef enum { 277 | KHRONOS_FALSE = 0, 278 | KHRONOS_TRUE = 1, 279 | KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM 280 | } khronos_boolean_enum_t; 281 | 282 | #endif /* __khrplatform_h_ */ 283 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include "glad/glad.h" 3 | #endif 4 | 5 | #include 6 | 7 | #include "World.h" 8 | #include "Configuration.h" 9 | 10 | #include "base/WorkQueue.h" 11 | 12 | #include "microprofile/microprofile.h" 13 | #include "microprofile/microprofileui.h" 14 | #include "microprofile/microprofiledraw.h" 15 | 16 | struct Vertex 17 | { 18 | Vector2f position; 19 | unsigned char r, g, b, a; 20 | 21 | }; 22 | 23 | void RenderBox(std::vector& vertices, Coords2f coords, Vector2f size, int r, int g, int b, int a) 24 | { 25 | Vector2f axisX = coords.xVector * size.x; 26 | Vector2f axisY = coords.yVector * size.y; 27 | 28 | Vertex v; 29 | 30 | v.r = r; 31 | v.g = g; 32 | v.b = b; 33 | v.a = a; 34 | 35 | v.position = coords.pos - axisX - axisY; 36 | vertices.push_back(v); 37 | 38 | v.position = coords.pos + axisX - axisY; 39 | vertices.push_back(v); 40 | 41 | v.position = coords.pos + axisX + axisY; 42 | vertices.push_back(v); 43 | 44 | v.position = coords.pos - axisX + axisY; 45 | vertices.push_back(v); 46 | } 47 | 48 | float random(float min, float max) 49 | { 50 | return min + (max - min) * (float(rand()) / float(RAND_MAX)); 51 | } 52 | 53 | const struct 54 | { 55 | Configuration::SolveMode mode; 56 | const char* name; 57 | } kSolveModes[] = 58 | { 59 | {Configuration::Solve_Scalar, "Scalar"}, 60 | 61 | #ifdef __SSE2__ 62 | {Configuration::Solve_SSE2, "SSE2"}, 63 | #endif 64 | 65 | #ifdef __AVX2__ 66 | {Configuration::Solve_AVX2, "AVX2"}, 67 | #endif 68 | }; 69 | 70 | const struct 71 | { 72 | Configuration::IslandMode mode; 73 | const char* name; 74 | } 
kIslandModes[] = 75 | { 76 | {Configuration::Island_Single, "Single"}, 77 | {Configuration::Island_Multiple, "Multiple"}, 78 | {Configuration::Island_SingleSloppy, "Single Sloppy"}, 79 | {Configuration::Island_MultipleSloppy, "Multiple Sloppy"}, 80 | }; 81 | 82 | const char* resetWorld(World& world, int scene) 83 | { 84 | MICROPROFILE_SCOPEI("Init", "resetWorld", -1); 85 | 86 | world.bodies.clear(); 87 | world.collider.manifolds.clear(); 88 | world.collider.manifoldMap.clear(); 89 | world.solver.contactJoints.clear(); 90 | 91 | RigidBody* groundBody = world.AddBody(Coords2f(Vector2f(0, 0), 0.0f), Vector2f(10000.f, 10.0f)); 92 | groundBody->invInertia = 0.0f; 93 | groundBody->invMass = 0.0f; 94 | 95 | world.AddBody(Coords2f(Vector2f(-1000, 1500), 0.0f), Vector2f(30.0f, 30.0f)); 96 | 97 | switch (scene % 8) 98 | { 99 | case 0: 100 | { 101 | for (int bodyIndex = 0; bodyIndex < 20000; bodyIndex++) 102 | { 103 | Vector2f pos = Vector2f(random(-500.0f, 500.0f), random(50.f, 1000.0f)); 104 | Vector2f size(4.f, 4.f); 105 | 106 | world.AddBody(Coords2f(pos, 0.f), size); 107 | } 108 | 109 | return "Falling"; 110 | } 111 | 112 | case 1: 113 | { 114 | for (int left = -100; left <= 100; left++) 115 | { 116 | for (int bodyIndex = 0; bodyIndex < 100; bodyIndex++) 117 | { 118 | Vector2f pos = Vector2f(left * 20, 10 + bodyIndex * 10); 119 | Vector2f size(10, 5); 120 | 121 | world.AddBody(Coords2f(pos, 0.f), size); 122 | } 123 | } 124 | 125 | return "Wall"; 126 | } 127 | 128 | case 2: 129 | { 130 | for (int step = 0; step < 100; ++step) 131 | { 132 | Vector2f pos = Vector2f(0, 1005 - step * 10); 133 | Vector2f size(10 + step * 5, 5); 134 | 135 | world.AddBody(Coords2f(pos, 0.f), size); 136 | } 137 | 138 | return "Pyramid"; 139 | } 140 | 141 | case 3: 142 | { 143 | for (int step = 0; step < 100; ++step) 144 | { 145 | Vector2f pos = Vector2f(0, 15 + step * 10); 146 | Vector2f size(10 + step * 5, 5); 147 | 148 | world.AddBody(Coords2f(pos, 0.f), size); 149 | } 150 | 151 | return "Reverse Pyramid"; 152 | } 153 | 154 | case 4: 155 | { 156 | for (int left = -100; left <= 100; left++) 157 | { 158 | for (int bodyIndex = 0; bodyIndex < 150; bodyIndex++) 159 | { 160 | Vector2f pos = Vector2f(left * 15, 15 + bodyIndex * 10); 161 | Vector2f size(5 - bodyIndex * 0.03f, 5); 162 | 163 | world.AddBody(Coords2f(pos, 0.f), size); 164 | } 165 | } 166 | 167 | return "Stacks"; 168 | } 169 | 170 | case 5: 171 | { 172 | world.AddBody(Coords2f(Vector2f(0.f, 400.f), 0.f), Vector2f(600.f, 10.f))->invMass = 0.f; 173 | world.AddBody(Coords2f(Vector2f(800.f, 200.f), 0.f), Vector2f(400.f, 10.f))->invMass = 0.f; 174 | 175 | for (int bodyIndex = 0; bodyIndex < 20000; bodyIndex++) 176 | { 177 | Vector2f pos = Vector2f(random(0.0f, 500.0f), random(500.f, 2500.0f)); 178 | Vector2f size(4.f, 4.f); 179 | 180 | world.AddBody(Coords2f(pos, 0.f), size); 181 | } 182 | 183 | return "Stacks"; 184 | } 185 | 186 | case 6: 187 | { 188 | world.AddBody(Coords2f(Vector2f(0.f, 400.f), 0.f), Vector2f(600.f, 10.f))->invMass = 0.f; 189 | world.AddBody(Coords2f(Vector2f(800.f, 200.f), 0.f), Vector2f(400.f, 10.f))->invMass = 0.f; 190 | 191 | RigidBody* body = world.AddBody(Coords2f(Vector2f(500.f, 500.f), -0.5f), Vector2f(600.f, 10.f)); 192 | body->invMass = 0.f; 193 | body->invInertia = 0.f; 194 | 195 | for (int bodyIndex = 0; bodyIndex < 10000; bodyIndex++) 196 | { 197 | Vector2f pos1 = Vector2f(random(200.0f, 500.0f), random(500.f, 2500.0f)); 198 | Vector2f pos2 = Vector2f(random(-500.0f, -200.0f), random(500.f, 2500.0f)); 199 | Vector2f size(4.f, 4.f); 200 
| 201 | world.AddBody(Coords2f(pos1, 0.f), size); 202 | world.AddBody(Coords2f(pos2, 0.f), size); 203 | } 204 | 205 | return "Dual Stacks"; 206 | } 207 | 208 | case 7: 209 | { 210 | for (int group = -5; group <= 5; ++group) 211 | { 212 | RigidBody* splitter = world.AddBody(Coords2f(Vector2f(group * 300, 500.f), 0.f), Vector2f(20.f, 1000.f)); 213 | splitter->invMass = 0.f; 214 | splitter->invInertia = 0.f; 215 | 216 | for (int bodyIndex = 0; bodyIndex < 4500; bodyIndex++) 217 | { 218 | Vector2f pos = Vector2f(group * 300 + random(50.f, 250.0f), random(50.f, 1500.0f)); 219 | Vector2f size(4.f, 4.f); 220 | 221 | world.AddBody(Coords2f(pos, 0.f), size); 222 | } 223 | } 224 | 225 | return "Islands"; 226 | } 227 | } 228 | 229 | return "Empty"; 230 | } 231 | 232 | bool keyPressed[GLFW_KEY_LAST + 1]; 233 | int mouseScrollDelta = 0; 234 | 235 | static void errorCallback(int error, const char* description) 236 | { 237 | fputs(description, stderr); 238 | } 239 | 240 | static void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) 241 | { 242 | keyPressed[key] = (action == GLFW_PRESS); 243 | } 244 | 245 | static void scrollCallback(GLFWwindow* window, double x, double y) 246 | { 247 | mouseScrollDelta = y; 248 | } 249 | 250 | int main(int argc, char** argv) 251 | { 252 | MicroProfileOnThreadCreate("Main"); 253 | MicroProfileSetEnableAllGroups(true); 254 | MicroProfileSetForceMetaCounters(true); 255 | 256 | int windowWidth = 1280, windowHeight = 1024; 257 | 258 | std::unique_ptr<WorkQueue> queue(new WorkQueue(WorkQueue::getIdealWorkerCount() - 1)); 259 | 260 | World world; 261 | 262 | int currentSolveMode = sizeof(kSolveModes) / sizeof(kSolveModes[0]) - 1; 263 | int currentIslandMode = sizeof(kIslandModes) / sizeof(kIslandModes[0]) - 1; 264 | int currentScene = 0; 265 | 266 | const char* currentSceneName = resetWorld(world, currentScene); 267 | 268 | const float gravity = -200.0f; 269 | const float integrationTime = 1 / 60.f; 270 | 271 | world.gravity = gravity; 272 | 273 | glfwSetErrorCallback(errorCallback); 274 | 275 | if (!glfwInit()) return 1; 276 | 277 | GLFWwindow* window = glfwCreateWindow(windowWidth, windowHeight, "PhyX", NULL, NULL); 278 | if (!window) return 1; 279 | 280 | glfwMakeContextCurrent(window); 281 | glfwSwapInterval(1); 282 | glfwSetKeyCallback(window, keyCallback); 283 | glfwSetScrollCallback(window, scrollCallback); 284 | 285 | #ifdef _WIN32 286 | if (!gladLoadGL()) return 2; 287 | #endif 288 | 289 | MicroProfileDrawInitGL(); 290 | MicroProfileGpuInitGL(); 291 | 292 | bool paused = false; 293 | 294 | double prevUpdateTime = 0.0f; 295 | 296 | std::vector<Vertex> vertices; 297 | 298 | float viewOffsetX = -500; 299 | float viewOffsetY = -40; 300 | float viewScale = 0.5f; 301 | 302 | int frameIndex = 0; 303 | 304 | while (!glfwWindowShouldClose(window)) 305 | { 306 | MicroProfileFlip(); 307 | 308 | MICROPROFILE_SCOPEI("MAIN", "Frame", 0xffee00); 309 | 310 | MICROPROFILE_LABELF("MAIN", "Index %d", frameIndex++); 311 | 312 | int width, height; 313 | glfwGetWindowSize(window, &width, &height); 314 | 315 | int frameWidth, frameHeight; 316 | glfwGetFramebufferSize(window, &frameWidth, &frameHeight); 317 | 318 | double mouseX, mouseY; 319 | glfwGetCursorPos(window, &mouseX, &mouseY); 320 | 321 | glViewport(0, 0, frameWidth, frameHeight); 322 | glClearColor(0.2f, 0.2f, 0.2f, 1.f); 323 | glClear(GL_COLOR_BUFFER_BIT); 324 | 325 | glMatrixMode(GL_PROJECTION); 326 | glLoadIdentity(); 327 | glOrtho(viewOffsetX / viewScale, width / viewScale + viewOffsetX / viewScale, viewOffsetY / 
viewScale, height / viewScale + viewOffsetY / viewScale, 1.f, -1.f); 328 | 329 | vertices.clear(); 330 | 331 | if (glfwGetTime() > prevUpdateTime + integrationTime) 332 | { 333 | prevUpdateTime += integrationTime; 334 | 335 | if (!paused) 336 | { 337 | RigidBody* draggedBody = &world.bodies[1]; 338 | Vector2f dragTarget = 339 | glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_RIGHT) 340 | ? Vector2f(mouseX + viewOffsetX, height + viewOffsetY - mouseY) / viewScale 341 | : draggedBody->coords.pos; 342 | 343 | Vector2f dstVelocity = (dragTarget - draggedBody->coords.pos) * 5e1f; 344 | 345 | draggedBody->acceleration.y -= gravity; 346 | draggedBody->acceleration += (dstVelocity - draggedBody->velocity) * 5e0; 347 | 348 | Configuration config = { kSolveModes[currentSolveMode].mode, kIslandModes[currentIslandMode].mode, 15, 15 }; 349 | world.Update(*queue, integrationTime, config); 350 | } 351 | } 352 | 353 | char stats[256]; 354 | sprintf(stats, "Scene: %s | Bodies: %d Manifolds: %d Contacts: %d Islands: %d (biggest: %d) | Cores: %d; Solve: %s; Island: %s; Iterations: %.2f", 355 | currentSceneName, 356 | int(world.bodies.size), 357 | int(world.collider.manifolds.size), 358 | int(world.solver.contactJoints.size), 359 | int(world.solver.islandCount), 360 | int(world.solver.islandMaxSize), 361 | int(queue->getWorkerCount() + 1), 362 | kSolveModes[currentSolveMode].name, 363 | kIslandModes[currentIslandMode].name, 364 | 0.f); 365 | 366 | { 367 | MICROPROFILE_SCOPEI("Render", "Render", 0xff0000); 368 | 369 | { 370 | MICROPROFILE_SCOPEI("Render", "Prepare", -1); 371 | 372 | for (int bodyIndex = 0; bodyIndex < world.bodies.size; bodyIndex++) 373 | { 374 | RigidBody* body = &world.bodies[bodyIndex]; 375 | Coords2f bodyCoords = body->coords; 376 | Vector2f size = body->geom.size; 377 | 378 | float colorMult = float(bodyIndex) / float(world.bodies.size) * 0.5f + 0.5f; 379 | int r = 50 * colorMult; 380 | int g = 125 * colorMult; 381 | int b = 218 * colorMult; 382 | 383 | if (bodyIndex == 1) //dragged body 384 | { 385 | r = 242; 386 | g = 236; 387 | b = 164; 388 | } 389 | 390 | RenderBox(vertices, bodyCoords, size, r, g, b, 255); 391 | } 392 | 393 | if (glfwGetKey(window, GLFW_KEY_V)) 394 | { 395 | for (int manifoldIndex = 0; manifoldIndex < world.collider.manifolds.size; manifoldIndex++) 396 | { 397 | Manifold& man = world.collider.manifolds[manifoldIndex]; 398 | 399 | for (int collisionNumber = 0; collisionNumber < man.pointCount; collisionNumber++) 400 | { 401 | ContactPoint& cp = world.collider.contactPoints[man.pointIndex + collisionNumber]; 402 | 403 | Coords2f coords = Coords2f(Vector2f(0.0f, 0.0f), 3.1415f / 4.0f); 404 | 405 | coords.pos = world.bodies[man.body1Index].coords.pos + cp.delta1; 406 | 407 | float redMult = cp.isNewlyCreated ? 
0.5f : 1.0f; 408 | 409 | RenderBox(vertices, coords, Vector2f(3.0f, 3.0f), 100, 100 * redMult, 100 * redMult, 100); 410 | 411 | coords.pos = world.bodies[man.body2Index].coords.pos + cp.delta2; 412 | 413 | RenderBox(vertices, coords, Vector2f(3.0f, 3.0f), 150, 150 * redMult, 150 * redMult, 100); 414 | } 415 | } 416 | } 417 | } 418 | 419 | { 420 | MICROPROFILE_SCOPEI("Render", "Perform", -1); 421 | MICROPROFILE_SCOPEGPUI("Scene", -1); 422 | 423 | if (!vertices.empty()) 424 | { 425 | glEnableClientState(GL_VERTEX_ARRAY); 426 | glEnableClientState(GL_COLOR_ARRAY); 427 | 428 | glVertexPointer(2, GL_FLOAT, sizeof(Vertex), &vertices[0].position); 429 | glColorPointer(4, GL_UNSIGNED_BYTE, sizeof(Vertex), &vertices[0].r); 430 | 431 | glDrawArrays(GL_QUADS, 0, vertices.size()); 432 | 433 | glDisableClientState(GL_VERTEX_ARRAY); 434 | glDisableClientState(GL_COLOR_ARRAY); 435 | } 436 | } 437 | 438 | { 439 | MICROPROFILE_SCOPEI("Render", "Profile", -1); 440 | MICROPROFILE_SCOPEGPUI("Profile", -1); 441 | 442 | MicroProfileBeginDraw(width, height, 1.f); 443 | 444 | MicroProfileDraw(width, height); 445 | MicroProfileDrawText(2, height - 12, 0xffffffff, stats, strlen(stats)); 446 | 447 | MicroProfileEndDraw(); 448 | } 449 | } 450 | 451 | MICROPROFILE_COUNTER_ADD("frame/count", 1); 452 | 453 | { 454 | MICROPROFILE_SCOPEI("MAIN", "Flip", 0xffee00); 455 | 456 | glfwSwapBuffers(window); 457 | } 458 | 459 | { 460 | MICROPROFILE_SCOPEI("MAIN", "Input", 0xffee00); 461 | 462 | // Handle input 463 | memset(keyPressed, 0, sizeof(keyPressed)); 464 | mouseScrollDelta = 0; 465 | 466 | glfwPollEvents(); 467 | 468 | bool mouseDown0 = glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_LEFT) == GLFW_PRESS; 469 | bool mouseDown1 = glfwGetMouseButton(window, GLFW_MOUSE_BUTTON_RIGHT) == GLFW_PRESS; 470 | 471 | MicroProfileMouseButton(mouseDown0, mouseDown1); 472 | MicroProfileMousePosition(mouseX, mouseY, mouseScrollDelta); 473 | MicroProfileModKey(glfwGetKey(window, GLFW_KEY_LEFT_SHIFT) == GLFW_PRESS); 474 | 475 | if (keyPressed[GLFW_KEY_ESCAPE]) 476 | break; 477 | 478 | if (keyPressed[GLFW_KEY_O]) 479 | MicroProfileToggleDisplayMode(); 480 | 481 | if (keyPressed[GLFW_KEY_P]) 482 | { 483 | paused = !paused; 484 | MicroProfileTogglePause(); 485 | } 486 | 487 | if (keyPressed[GLFW_KEY_I]) 488 | currentIslandMode = (currentIslandMode + 1) % (sizeof(kIslandModes) / sizeof(kIslandModes[0])); 489 | 490 | if (keyPressed[GLFW_KEY_M]) 491 | currentSolveMode = (currentSolveMode + 1) % (sizeof(kSolveModes) / sizeof(kSolveModes[0])); 492 | 493 | if (keyPressed[GLFW_KEY_R]) 494 | currentSceneName = resetWorld(world, currentScene); 495 | 496 | if (keyPressed[GLFW_KEY_S]) 497 | currentSceneName = resetWorld(world, ++currentScene); 498 | 499 | if (keyPressed[GLFW_KEY_C]) 500 | { 501 | unsigned int cores = queue->getWorkerCount() + 1; 502 | unsigned int newcores = 503 | (cores == WorkQueue::getIdealWorkerCount()) 504 | ? 1 505 | : std::min(cores * 2, WorkQueue::getIdealWorkerCount()); 506 | 507 | queue.reset(new WorkQueue(newcores - 1)); 508 | } 509 | 510 | if (glfwGetKey(window, GLFW_KEY_LEFT)) 511 | viewOffsetX -= 10; 512 | 513 | if (glfwGetKey(window, GLFW_KEY_RIGHT)) 514 | viewOffsetX += 10; 515 | 516 | if (glfwGetKey(window, GLFW_KEY_UP)) 517 | viewScale *= 1.05f; 518 | 519 | if (glfwGetKey(window, GLFW_KEY_DOWN)) 520 | viewScale /= 1.05f; 521 | } 522 | } 523 | 524 | glfwDestroyWindow(window); 525 | glfwTerminate(); 526 | 527 | MicroProfileShutdown(); 528 | } --------------------------------------------------------------------------------
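A note on the main loop in `src/main.cpp` above: simulation is decoupled from rendering with a fixed-timestep check. `world.Update` runs only once wall-clock time (`glfwGetTime()`) has passed `prevUpdateTime + integrationTime`, and `prevUpdateTime` then advances by exactly one `integrationTime` (1/60 s), so the physics step size stays fixed regardless of the display rate. Below is a minimal standalone sketch of that same pattern; the `stepPhysics`/`render` stand-ins and the use of `std::chrono` in place of `glfwGetTime` are illustrative assumptions, not part of PhyX.

```cpp
// Sketch of the fixed-timestep accumulator used by the demo loop (assumptions noted above).
#include <chrono>
#include <functional>

void runLoop(const std::function<bool()>& shouldClose,
             const std::function<void(float)>& stepPhysics, // stand-in for world.Update(...)
             const std::function<void()>& render)           // stand-in for the GL drawing code
{
    using clock = std::chrono::steady_clock;

    const float integrationTime = 1 / 60.f; // fixed physics step, matching main.cpp
    const auto start = clock::now();

    double prevUpdateTime = 0.0;

    while (!shouldClose())
    {
        double now = std::chrono::duration<double>(clock::now() - start).count();

        // At most one physics step per rendered frame, as in the original loop:
        // if rendering falls behind, the simulation slows down rather than spiraling.
        if (now > prevUpdateTime + integrationTime)
        {
            prevUpdateTime += integrationTime;
            stepPhysics(integrationTime);
        }

        render();
    }
}
```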