├── AllMOCProjects.sln ├── CMakeLists.txt ├── CONTRIBUTING.md ├── CompilerSpecific.inl ├── CullingThreadpool.cpp ├── CullingThreadpool.h ├── D3DValidate ├── D3DValidate.cpp ├── D3DValidate.sln ├── D3DValidate.vcxproj └── D3DValidate.vcxproj.filters ├── Example ├── CMakeLists.txt ├── Example.sln ├── Example.vcxproj ├── Example.vcxproj.filters └── ExampleMain.cpp ├── FillrateTest ├── CMakeLists.txt ├── FillrateTest.cpp ├── FillrateTest.sln ├── FillrateTest.vcxproj └── FillrateTest.vcxproj.filters ├── FrameRecorder.cpp ├── FrameRecorder.h ├── FrameRecorderPlayer ├── FrameRecorderPlayer.cpp ├── FrameRecorderPlayer.sln ├── FrameRecorderPlayer.vcxproj ├── FrameRecorderPlayer.vcxproj.filters ├── OcclusionCulling_0.mocrec ├── OcclusionCulling_1.mocrec └── OcclusionCulling_2.mocrec ├── MaskedOcclusionCulling.cpp ├── MaskedOcclusionCulling.h ├── MaskedOcclusionCullingAVX2.cpp ├── MaskedOcclusionCullingAVX512.cpp ├── MaskedOcclusionCullingCommon.inl ├── README.md ├── StaticLib └── StaticLib.vcxproj └── license.txt /AllMOCProjects.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27004.2002 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "D3DValidate", "D3DValidate\D3DValidate.vcxproj", "{02474EA7-2575-4AE7-A86C-6125EE7D1F08}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Example", "Example\Example.vcxproj", "{0A343471-42A8-4C29-AD98-9A976C514336}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FillrateTest", "FillrateTest\FillrateTest.vcxproj", "{C4229C47-7922-417C-9931-348CA8750D53}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FrameRecorderPlayer", "FrameRecorderPlayer\FrameRecorderPlayer.vcxproj", "{D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}" 13 | EndProject 14 | 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StaticLib", "StaticLib\StaticLib.vcxproj", "{958E770F-16AB-468D-9DA8-C2DFF91824BB}" 15 | EndProject 16 | Global 17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 18 | Debug_LLVM|x64 = Debug_LLVM|x64 19 | Debug_LLVM|x86 = Debug_LLVM|x86 20 | Debug|x64 = Debug|x64 21 | Debug|x86 = Debug|x86 22 | Release_LLVM|x64 = Release_LLVM|x64 23 | Release_LLVM|x86 = Release_LLVM|x86 24 | Release|x64 = Release|x64 25 | Release|x86 = Release|x86 26 | EndGlobalSection 27 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 28 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 29 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 30 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug_LLVM|x86.ActiveCfg = Debug_LLVM|x64 31 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug|x64.ActiveCfg = Debug|x64 32 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug|x64.Build.0 = Debug|x64 33 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug|x86.ActiveCfg = Debug|x64 34 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 35 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 36 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release_LLVM|x86.ActiveCfg = Release_LLVM|x64 37 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release|x64.ActiveCfg = Release|x64 38 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release|x64.Build.0 = Release|x64 39 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release|x86.ActiveCfg = Release|x64 40 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 41 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 42 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug_LLVM|x86.ActiveCfg = Debug_LLVM|x64 43 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug|x64.ActiveCfg = Debug|x64 44 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug|x64.Build.0 = Debug|x64 45 | 
{0A343471-42A8-4C29-AD98-9A976C514336}.Debug|x86.ActiveCfg = Debug|x64 46 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 47 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 48 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release_LLVM|x86.ActiveCfg = Release_LLVM|x64 49 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release|x64.ActiveCfg = Release|x64 50 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release|x64.Build.0 = Release|x64 51 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release|x86.ActiveCfg = Release|x64 52 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 53 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 54 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug_LLVM|x86.ActiveCfg = Debug_LLVM|x64 55 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug|x64.ActiveCfg = Debug|x64 56 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug|x64.Build.0 = Debug|x64 57 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug|x86.ActiveCfg = Debug|x64 58 | {C4229C47-7922-417C-9931-348CA8750D53}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 59 | {C4229C47-7922-417C-9931-348CA8750D53}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 60 | {C4229C47-7922-417C-9931-348CA8750D53}.Release_LLVM|x86.ActiveCfg = Release_LLVM|x64 61 | {C4229C47-7922-417C-9931-348CA8750D53}.Release|x64.ActiveCfg = Release|x64 62 | {C4229C47-7922-417C-9931-348CA8750D53}.Release|x64.Build.0 = Release|x64 63 | {C4229C47-7922-417C-9931-348CA8750D53}.Release|x86.ActiveCfg = Release|x64 64 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 65 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 66 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug_LLVM|x86.ActiveCfg = Debug_LLVM|x64 67 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug|x64.ActiveCfg = Debug|x64 68 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug|x64.Build.0 = Debug|x64 69 | 
{D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug|x86.ActiveCfg = Debug|x64 70 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 71 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 72 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release_LLVM|x86.ActiveCfg = Release_LLVM|x64 73 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release|x64.ActiveCfg = Release|x64 74 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release|x64.Build.0 = Release|x64 75 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release|x86.ActiveCfg = Release|x64 76 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug_LLVM|x64.ActiveCfg = Debug|x64 77 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug_LLVM|x64.Build.0 = Debug|x64 78 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug_LLVM|x86.ActiveCfg = Debug|Win32 79 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug_LLVM|x86.Build.0 = Debug|Win32 80 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug|x64.ActiveCfg = Debug|x64 81 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug|x64.Build.0 = Debug|x64 82 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug|x86.ActiveCfg = Debug|Win32 83 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Debug|x86.Build.0 = Debug|Win32 84 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release_LLVM|x64.ActiveCfg = Release|x64 85 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release_LLVM|x64.Build.0 = Release|x64 86 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release_LLVM|x86.ActiveCfg = Release|Win32 87 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release_LLVM|x86.Build.0 = Release|Win32 88 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release|x64.ActiveCfg = Release|x64 89 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release|x64.Build.0 = Release|x64 90 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release|x86.ActiveCfg = Release|Win32 91 | {958E770F-16AB-468D-9DA8-C2DFF91824BB}.Release|x86.Build.0 = Release|Win32 92 | EndGlobalSection 93 | GlobalSection(SolutionProperties) = preSolution 94 | HideSolutionNode = FALSE 95 | EndGlobalSection 96 | 
GlobalSection(ExtensibilityGlobals) = postSolution 97 | SolutionGuid = {4EE1E6EF-D6F0-4F01-B18A-433CFE41B240} 98 | EndGlobalSection 99 | EndGlobal 100 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # CMake file for the masked occlusion culling library 3 | # 4 | set(CMAKE_SUPPRESS_REGENERATION true) 5 | option(USE_AVX512 "Enable experimental AVX-512 support" OFF) 6 | set(CMAKE_CONFIGURATION_TYPES Debug Release) 7 | 8 | # 9 | # Lists of all files included in the library 10 | # 11 | set( MOC_AVX512_FILES MaskedOcclusionCullingAVX512.cpp ) 12 | set( MOC_AVX2_FILES MaskedOcclusionCullingAVX2.cpp ) 13 | set( MOC_SSE_FILES MaskedOcclusionCulling.cpp CullingThreadpool.cpp ) 14 | set( MOC_INCLUDE_FILES MaskedOcclusionCulling.h CullingThreadpool.h CompilerSpecific.inl MaskedOcclusionCullingCommon.inl ) 15 | set( MOC_FILES ${MOC_AVX512_FILES} ${MOC_AVX2_FILES} ${MOC_SSE_FILES} ${MOC_INCLUDE_FILES} ) 16 | 17 | # 18 | # Common compiler flags 19 | # 20 | if(MSVC) 21 | if(MSVC_VERSION LESS 1900) 22 | set(CMAKE_CXX_FLAGS "-std=c++11") 23 | endif() 24 | else() 25 | set(CMAKE_CXX_FLAGS "-std=c++11 -m64") 26 | endif() 27 | 28 | if(MSVC) 29 | # 30 | # Setup compiler flags for AVX-512 files (MSVC) 31 | # 32 | 33 | if (USE_AVX512) 34 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 /arch:AVX2" ) 35 | else() 36 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" ) 37 | endif() 38 | 39 | # 40 | # Setup compiler flags for AVX2 files (MSVC) 41 | # 42 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" ) 43 | 44 | # 45 | # Setup compiler flags for SSE4.1 / SSE2 files (MSVC) 46 | # 47 | if(NOT "${CMAKE_GENERATOR}" MATCHES "(Win64|IA64)") 48 | # SSE2 is always enabled on 64-bit architectures, specifying redundant flag produces a compiler 
warning 49 | if(MSVC_VERSION LESS 1900) 50 | SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "/arch:SSE2" ) 51 | endif() 52 | endif() 53 | 54 | else() 55 | 56 | # 57 | # Setup compiler flags for AVX-512 files 58 | # 59 | if (USE_AVX512) 60 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 -mavx512f -mavx512bw -mavx512dq -mavx2 -mfma -msse4.1" ) 61 | else() 62 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" ) 63 | endif() 64 | 65 | # 66 | # Setup compiler flags for AVX2 files 67 | # 68 | SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" ) 69 | 70 | # 71 | # Setup compiler flags for SSE4.1 / SSE2 files 72 | # 73 | SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1" ) 74 | 75 | endif() 76 | 77 | # 78 | # Create masked occlusion culling library 79 | # 80 | add_library( MaskedOcclusionCulling ${MOC_FILES} ) 81 | 82 | # 83 | # Add folder to include path 84 | # 85 | target_include_directories(MaskedOcclusionCulling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 86 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We accept contributions as pull requests on GitHub, and will attempt to do so 4 | as quickly as possible. 5 | 6 | Masked Occlusion Culling is licensed under the terms in 7 | [LICENSE](https://github.com/GameTechDev/MaskedOcclusionCulling/blob/master/license.txt). 8 | By contributing to the project, you agree to the license and copyright terms 9 | therein and release your contribution under these terms. 
10 | 11 | You must also certify that the contributions adhere to the requirements 12 | outlined in the following Developer Certificate of Origin: 13 | 14 | ``` 15 | Developer Certificate of Origin 16 | Version 1.1 17 | 18 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 19 | 660 York Street, Suite 102, 20 | San Francisco, CA 94110 USA 21 | 22 | Everyone is permitted to copy and distribute verbatim copies of this 23 | license document, but changing it is not allowed. 24 | 25 | Developer's Certificate of Origin 1.1 26 | 27 | By making a contribution to this project, I certify that: 28 | 29 | (a) The contribution was created in whole or in part by me and I 30 | have the right to submit it under the open source license 31 | indicated in the file; or 32 | 33 | (b) The contribution is based upon previous work that, to the best 34 | of my knowledge, is covered under an appropriate open source 35 | license and I have the right under that license to submit that 36 | work with modifications, whether created in whole or in part 37 | by me, under the same open source license (unless I am 38 | permitted to submit under a different license), as indicated 39 | in the file; or 40 | 41 | (c) The contribution was provided directly to me by some other 42 | person who certified (a), (b) or (c) and I have not modified 43 | it. 44 | 45 | (d) I understand and agree that this project and the contribution 46 | are public and that a record of the contribution (including all 47 | personal information I submit with it, including my sign-off) is 48 | maintained indefinitely and may be redistributed consistent with 49 | this project or the open source license(s) involved. 
50 | ``` 51 | 52 | To do so, each commit must be signed off by including a line like the following 53 | in your commit message (using your full legal name, and email address): 54 | 55 | Signed-off-by: Joe Smith <joe.smith@email.com> 56 | 57 | If you set your `user.name` and `user.email` git config accordingly, this line 58 | will be added if you use `git commit -s`. 59 | -------------------------------------------------------------------------------- /CompilerSpecific.inl: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | 17 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 18 | // Common shared include file to hide compiler/os specific functions from the rest of the code. 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Defined only for the genuine Microsoft compiler, i.e. not for the Intel or clang
// front ends that also define _MSC_VER.
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__)
	#define __MICROSOFT_COMPILER
#endif

#if defined(_WIN32) && (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__clang__)) // Windows: MSVC / Intel compiler / clang
	// NOTE(review): the header names of these two #include lines were missing from this copy of
	// the file. <intrin.h> is required by _BitScanForward below; <new.h> is reconstructed from
	// the upstream project - confirm both against the original source.
	#include <intrin.h>
	#include <new.h>

	#define FORCE_INLINE __forceinline

	// Returns the index of the lowest set bit of *mask and clears that bit in *mask.
	// *mask must be non-zero: _BitScanForward does not produce a meaningful index for 0.
	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
	{
		unsigned long idx;
		_BitScanForward(&idx, *mask);
		*mask &= *mask - 1; // clears the lowest set bit
		return idx;
	}

	// Allocates size bytes aligned to alignment bytes. Note the argument order: alignment
	// comes first here, whereas _aligned_malloc takes the size first.
	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
	{
		return _aligned_malloc(size, alignment);
	}

	// Releases memory obtained from aligned_alloc() above.
	FORCE_INLINE void aligned_free(void *ptr)
	{
		_aligned_free(ptr);
	}

#elif defined(__GNUG__) || defined(__clang__) // G++ or clang
	// NOTE(review): the header names of the #include lines in this branch were missing from this
	// copy of the file. <cpuid.h> (for __cpuid_count) and the malloc headers (see the "memalign"
	// remarks) follow from the code below; <mm_malloc.h>, <immintrin.h> and <new> are
	// reconstructed from the upstream project - confirm against the original source.
	#include <cpuid.h>
#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
	#include <malloc/malloc.h> // memalign
#else
	#include <malloc.h> // memalign
#endif
	#include <mm_malloc.h>
	#include <immintrin.h>
	#include <new>

	#define FORCE_INLINE inline

	// Returns the index of the lowest set bit of *mask and clears that bit in *mask.
	// *mask must be non-zero: the result of __builtin_ctz(0) is undefined.
	FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
	{
		// __builtin_ctz is the variant that matches the 32-bit operand
		unsigned long idx = (unsigned long)__builtin_ctz(*mask);
		*mask &= *mask - 1; // clears the lowest set bit
		return idx;
	}

	// Allocates size bytes aligned to alignment bytes.
	FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
	{
		return memalign(alignment, size);
	}

	// Releases memory obtained from aligned_alloc() above.
	FORCE_INLINE void aligned_free(void *ptr)
	{
		free(ptr);
	}

	// The toolchain's own headers may already declare __cpuidex and/or _xgetbv (as inline
	// functions or as macros); defining functions with those names a second time then fails to
	// compile. The fallbacks therefore live under private names, and the public spellings are
	// mapped onto them so that existing call sites keep working unchanged.
	// NOTE(review): believed to affect GCC 8+ / Clang 9+ (_xgetbv) and GCC 11+ (__cpuidex) - confirm.

	// Executes CPUID for leaf `function` / sub-leaf `subfunction` and stores EAX, EBX, ECX, EDX
	// in cpuinfo[0..3].
	FORCE_INLINE void moc_cpuidex_fallback(int* cpuinfo, int function, int subfunction)
	{
		__cpuid_count(function, subfunction, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
	}

	#ifdef __cpuidex
		#undef __cpuidex
	#endif
	#define __cpuidex(cpuinfo, function, subfunction) moc_cpuidex_fallback((cpuinfo), (function), (subfunction))

	// Executes XGETBV for the extended control register selected by `index` and returns the
	// 64-bit value assembled from EDX:EAX.
	FORCE_INLINE unsigned long long moc_xgetbv_fallback(unsigned int index)
	{
		unsigned int eax, edx;
		__asm__ __volatile__(
			"xgetbv;"
			: "=a" (eax), "=d"(edx)
			: "c" (index)
		);
		return ((unsigned long long)edx << 32) | eax;
	}

	#ifdef _xgetbv
		#undef _xgetbv
	#endif
	#define _xgetbv(index) moc_xgetbv_fallback((index))

#else
	#error Unsupported compiler
#endif
// --------------------------------------------------------------------------------
// /CullingThreadpool.cpp:
// --------------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
// Copyright 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include 17 | #include "CullingThreadpool.h" 18 | 19 | #define SAFE_DELETE(X) {if (X != nullptr) delete X; X = nullptr;} 20 | #define SAFE_DELETE_ARRAY(X) {if (X != nullptr) delete[] X; X = nullptr;} 21 | 22 | template CullingThreadpool::StateData::StateData(unsigned int maxJobs) : 23 | mMaxJobs(maxJobs), 24 | mCurrentIdx(~0) 25 | { 26 | mData = new T[mMaxJobs]; 27 | } 28 | 29 | template CullingThreadpool::StateData::~StateData() 30 | { 31 | SAFE_DELETE_ARRAY(mData); 32 | } 33 | 34 | template void CullingThreadpool::StateData::AddData(const T &data) 35 | { 36 | mCurrentIdx++; mData[mCurrentIdx % mMaxJobs] = data; 37 | } 38 | 39 | template const T *CullingThreadpool::StateData::GetData() const 40 | { 41 | return &mData[mCurrentIdx % mMaxJobs]; 42 | } 43 | 44 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 45 | // Helper class: Mostly lockless queue for render jobs 46 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 47 | 48 | CullingThreadpool::RenderJobQueue::RenderJobQueue(unsigned int nBins, unsigned int maxJobs) : 49 | mNumBins(nBins), 50 | mMaxJobs(maxJobs) 51 | { 52 | mRenderPtrs = new std::atomic_uint[mNumBins]; 53 | mBinMutexes = new std::atomic_uint[mNumBins]; 54 | for (unsigned int i = 0; i < mNumBins; ++i) 55 | mBinMutexes[i] = 0; 56 | 57 | mJobs = new Job[mMaxJobs]; 58 | for (unsigned int i = 0; i < mMaxJobs; ++i) 59 | mJobs[i].mRenderJobs = new TriList[mNumBins]; 60 | 61 | // Compute worst case job size (we allocate memory for the worst case) 62 | const unsigned int TriSize = 3 * 3; 63 | const unsigned int MaxTrisPerJob = TRIS_PER_JOB * 6; 64 | const unsigned int MaxJobSize = MaxTrisPerJob * TriSize; 65 | mTrilistData = new float[MaxJobSize * mMaxJobs * mNumBins]; 66 | 
67 | // Setup trilist objects used for binning 68 | for (unsigned int i = 0; i < mMaxJobs; ++i) 69 | { 70 | for (unsigned int j = 0; j < mNumBins; ++j) 71 | { 72 | int idx = i*mNumBins + j; 73 | TriList &tList = mJobs[i].mRenderJobs[j]; 74 | tList.mNumTriangles = MaxTrisPerJob; 75 | tList.mTriIdx = 0; 76 | tList.mPtr = mTrilistData + idx*MaxJobSize; 77 | } 78 | } 79 | 80 | // Clear render queue 81 | Reset(); 82 | } 83 | 84 | CullingThreadpool::RenderJobQueue::~RenderJobQueue() 85 | { 86 | SAFE_DELETE_ARRAY(mRenderPtrs); 87 | SAFE_DELETE_ARRAY(mBinMutexes); 88 | for (unsigned int i = 0; i < mMaxJobs; ++i) 89 | SAFE_DELETE_ARRAY(mJobs[i].mRenderJobs); 90 | SAFE_DELETE_ARRAY(mJobs); 91 | SAFE_DELETE_ARRAY(mTrilistData); 92 | } 93 | 94 | inline unsigned int CullingThreadpool::RenderJobQueue::GetMinRenderPtr() const 95 | { 96 | unsigned int minRenderPtr = mRenderPtrs[0]; 97 | for (unsigned int i = 1; i < mNumBins; ++i) 98 | { 99 | unsigned int renderPtr = mRenderPtrs[i]; 100 | minRenderPtr = renderPtr < minRenderPtr ? 
renderPtr : minRenderPtr; 101 | } 102 | return minRenderPtr; 103 | } 104 | 105 | inline void CullingThreadpool::RenderJobQueue::AdvanceRenderJob(int binIdx) 106 | { 107 | mRenderPtrs[binIdx]++; 108 | mBinMutexes[binIdx] = 0; 109 | } 110 | 111 | inline unsigned int CullingThreadpool::RenderJobQueue::GetBestGlobalQueue() const 112 | { 113 | // Find least advanced queue 114 | unsigned int bestBin = ~0, bestPtr = mWritePtr; 115 | for (unsigned int i = 0; i < mNumBins; ++i) 116 | { 117 | if (mRenderPtrs[i] < bestPtr && mBinMutexes[i] == 0) 118 | { 119 | bestBin = i; 120 | bestPtr = mRenderPtrs[i]; 121 | } 122 | } 123 | return bestBin; 124 | } 125 | 126 | inline bool CullingThreadpool::RenderJobQueue::IsPipelineEmpty() const 127 | { 128 | return GetMinRenderPtr() == mWritePtr; 129 | } 130 | 131 | inline bool CullingThreadpool::RenderJobQueue::CanWrite() const 132 | { 133 | return mWritePtr - GetMinRenderPtr() < mMaxJobs; 134 | } 135 | 136 | inline bool CullingThreadpool::RenderJobQueue::CanBin() const 137 | { 138 | return mBinningPtr < mWritePtr && mBinningPtr - GetMinRenderPtr() < mMaxJobs; 139 | } 140 | 141 | inline CullingThreadpool::RenderJobQueue::Job *CullingThreadpool::RenderJobQueue::GetWriteJob() 142 | { 143 | return &mJobs[mWritePtr % mMaxJobs]; 144 | } 145 | 146 | inline void CullingThreadpool::RenderJobQueue::AdvanceWriteJob() 147 | { 148 | mWritePtr++; 149 | } 150 | 151 | inline CullingThreadpool::RenderJobQueue::Job *CullingThreadpool::RenderJobQueue::GetBinningJob() 152 | { 153 | unsigned int binningPtr = mBinningPtr; 154 | if (binningPtr < mWritePtr && binningPtr - GetMinRenderPtr() < mMaxJobs) 155 | { 156 | if (mBinningPtr.compare_exchange_strong(binningPtr, binningPtr + 1)) 157 | { 158 | mJobs[binningPtr % mMaxJobs].mBinningJobStartedIdx = binningPtr; 159 | return &mJobs[binningPtr % mMaxJobs]; 160 | } 161 | } 162 | return nullptr; 163 | } 164 | 165 | inline void CullingThreadpool::RenderJobQueue::FinishedBinningJob(Job *job) 166 | { 167 | 
job->mBinningJobCompletedIdx = job->mBinningJobStartedIdx; 168 | } 169 | 170 | inline CullingThreadpool::RenderJobQueue::Job *CullingThreadpool::RenderJobQueue::GetRenderJob(int binIdx) 171 | { 172 | // Attempt to lock bin mutex 173 | unsigned int expected = 0; 174 | if (!mBinMutexes[binIdx].compare_exchange_strong(expected, 1)) 175 | return nullptr; 176 | 177 | // Check any items in the queue, and bail if empty 178 | if (mRenderPtrs[binIdx] != mJobs[mRenderPtrs[binIdx] % mMaxJobs].mBinningJobCompletedIdx) 179 | { 180 | mBinMutexes[binIdx] = 0; 181 | return nullptr; 182 | } 183 | 184 | return &mJobs[mRenderPtrs[binIdx] % mMaxJobs]; 185 | } 186 | 187 | void CullingThreadpool::RenderJobQueue::Reset() 188 | { 189 | mWritePtr = 0; 190 | mBinningPtr = 0; 191 | 192 | for (unsigned int i = 0; i < mNumBins; ++i) 193 | mRenderPtrs[i] = 0; 194 | 195 | for (unsigned int i = 0; i < mMaxJobs; ++i) 196 | { 197 | mJobs[i].mBinningJobCompletedIdx = -1; 198 | mJobs[i].mBinningJobStartedIdx = -1; 199 | } 200 | } 201 | 202 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 203 | // Culling threadpool private helper functions 204 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 205 | 206 | void CullingThreadpool::SetupScissors() 207 | { 208 | unsigned int width, height; 209 | mMOC->GetResolution(width, height); 210 | 211 | unsigned int binWidth; 212 | unsigned int binHeight; 213 | mMOC->ComputeBinWidthHeight(mBinsW, mBinsH, binWidth, binHeight); 214 | 215 | for (unsigned int ty = 0; ty < mBinsH; ++ty) 216 | { 217 | for (unsigned int tx = 0; tx < mBinsW; ++tx) 218 | { 219 | unsigned int threadIdx = tx + ty*mBinsW; 220 | 221 | // Adjust rects on final row / col to match resolution 222 | mRects[threadIdx].mMinX = tx*binWidth; 223 | mRects[threadIdx].mMaxX = tx + 1 == mBinsW ? 
width : (tx + 1) * binWidth; 224 | mRects[threadIdx].mMinY = ty * binHeight; 225 | mRects[threadIdx].mMaxY = ty + 1 == mBinsH ? height : (ty + 1) * binHeight; 226 | } 227 | } 228 | } 229 | 230 | void CullingThreadpool::ThreadRun(CullingThreadpool *threadPool, unsigned int threadId) 231 | { 232 | threadPool->ThreadMain(threadId); 233 | } 234 | 235 | void CullingThreadpool::ThreadMain(unsigned int threadIdx) 236 | { 237 | while (true) 238 | { 239 | bool threadIsIdle = true; 240 | unsigned int threadBinIdx = threadIdx; 241 | 242 | // Wait for threads to be woken up (low CPU load sleep) 243 | std::unique_lock lock(mSuspendedMutex); 244 | mNumSuspendedThreads++; 245 | mSuspendedCV.wait(lock, [&] {return !mSuspendThreads; }); 246 | mNumSuspendedThreads--; 247 | lock.unlock(); 248 | 249 | // Loop until suspended again 250 | while (!mSuspendThreads || !threadIsIdle) 251 | { 252 | if (mKillThreads) 253 | return; 254 | 255 | threadIsIdle = false; 256 | 257 | // Prio 1: Process any render jobs local to this thread 258 | unsigned int binIdx = threadBinIdx; 259 | threadBinIdx = threadBinIdx + mNumThreads < mNumBins ? 
threadBinIdx + mNumThreads : threadIdx; 260 | RenderJobQueue::Job *job = mRenderQueue->GetRenderJob(binIdx); 261 | if (job != nullptr) 262 | { 263 | if (job->mRenderJobs[binIdx].mTriIdx > 0) 264 | mMOC->RenderTrilist(job->mRenderJobs[binIdx], &mRects[binIdx]); 265 | 266 | mRenderQueue->AdvanceRenderJob(binIdx); 267 | continue; 268 | } 269 | 270 | // Prio 2: Process any outstanding setup/binning jobs 271 | if (mRenderQueue->CanBin()) 272 | { 273 | // If no more rasterization jobs, get next binning job 274 | RenderJobQueue::Job *job = mRenderQueue->GetBinningJob(); 275 | if (job != nullptr) 276 | { 277 | RenderJobQueue::BinningJob &sjob = job->mBinningJob; 278 | for (unsigned int i = 0; i < mNumBins; ++i) 279 | job->mRenderJobs[i].mTriIdx = 0; 280 | mMOC->BinTriangles(sjob.mVerts, sjob.mTris, sjob.nTris, job->mRenderJobs, mBinsW, mBinsH, sjob.mMatrix, sjob.mBfWinding, sjob.mClipPlanes, *sjob.mVtxLayout); 281 | mRenderQueue->FinishedBinningJob(job); 282 | } 283 | continue; 284 | } 285 | 286 | // Prio 3: No work is available, work steal from another thread's queue 287 | if (mNumBins > mNumThreads) 288 | { 289 | binIdx = mRenderQueue->GetBestGlobalQueue(); 290 | if (binIdx < mRenderQueue->mNumBins) 291 | { 292 | RenderJobQueue::Job *job = mRenderQueue->GetRenderJob(binIdx); 293 | if (job != nullptr) 294 | { 295 | if (job->mRenderJobs[binIdx].mTriIdx > 0) 296 | mMOC->RenderTrilist(job->mRenderJobs[binIdx], &mRects[binIdx]); 297 | 298 | mRenderQueue->AdvanceRenderJob(binIdx); 299 | } 300 | continue; 301 | } 302 | } 303 | 304 | // No work available: Yield this thread 305 | std::this_thread::yield(); 306 | threadIsIdle = true; 307 | } 308 | } 309 | } 310 | 311 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 312 | // Culling threadpool public API, similar to the MaskedOcclusionCulling class 313 | 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 314 | 315 | CullingThreadpool::CullingThreadpool(unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs) : 316 | mNumThreads(numThreads), 317 | mMaxJobs(maxJobs), 318 | mBinsW(binsW), 319 | mBinsH(binsH), 320 | mKillThreads(false), 321 | mSuspendThreads(true), 322 | mNumSuspendedThreads(0), 323 | mModelToClipMatrices(maxJobs), 324 | mVertexLayouts(maxJobs), 325 | mMOC(nullptr) 326 | { 327 | mNumBins = mBinsW*mBinsH; 328 | assert(mNumBins >= mNumThreads); // Having less bins than threads is a bad idea! 329 | 330 | mRects = new ScissorRect[mNumBins]; 331 | mRenderQueue = new RenderJobQueue(mNumBins, mMaxJobs); 332 | 333 | // Add default vertex layout and matrix 334 | mVertexLayouts.AddData(VertexLayout(16, 4, 12)); 335 | mCurrentMatrix = nullptr; 336 | 337 | mThreads = new std::thread[mNumThreads]; 338 | for (unsigned int i = 0; i < mNumThreads; ++i) 339 | mThreads[i] = std::thread(ThreadRun, this, i); 340 | 341 | } 342 | 343 | CullingThreadpool::~CullingThreadpool() 344 | { 345 | // Wait for threads to terminate 346 | if (mThreads != nullptr || !mKillThreads) 347 | { 348 | WakeThreads(); 349 | mKillThreads = true; 350 | for (unsigned int i = 0; i < mNumThreads; ++i) 351 | mThreads[i].join(); 352 | 353 | } 354 | 355 | // Free memory 356 | SAFE_DELETE(mRenderQueue); 357 | SAFE_DELETE_ARRAY(mRects); 358 | SAFE_DELETE_ARRAY(mThreads); 359 | } 360 | 361 | void CullingThreadpool::WakeThreads() 362 | { 363 | // Wait for all threads to be in suspended mode 364 | while (mNumSuspendedThreads < mNumThreads) 365 | std::this_thread::yield(); 366 | 367 | // Send wake up event 368 | std::unique_lock lock(mSuspendedMutex); 369 | mSuspendThreads = false; 370 | lock.unlock(); 371 | mSuspendedCV.notify_all(); 372 | } 373 | 374 | void CullingThreadpool::SuspendThreads() 375 | { 376 | // Signal threads to go into 
suspended mode (after finishing all outstanding work) 377 | mSuspendThreads = true; 378 | } 379 | 380 | void CullingThreadpool::Flush() 381 | { 382 | // Wait for pipeline to be empty (i.e. all work is finished) 383 | while (!mRenderQueue->IsPipelineEmpty()) 384 | std::this_thread::yield(); 385 | 386 | // Reset queue counters 387 | mRenderQueue->Reset(); 388 | } 389 | 390 | void CullingThreadpool::SetBuffer(MaskedOcclusionCulling *moc) 391 | { 392 | Flush(); 393 | mMOC = moc; 394 | SetupScissors(); 395 | } 396 | 397 | void CullingThreadpool::SetResolution(unsigned int width, unsigned int height) 398 | { 399 | Flush(); 400 | mMOC->SetResolution(width, height); 401 | SetupScissors(); 402 | } 403 | 404 | void CullingThreadpool::SetNearClipPlane(float nearDist) 405 | { 406 | Flush(); 407 | mMOC->SetNearClipPlane(nearDist); 408 | } 409 | 410 | void CullingThreadpool::SetMatrix(const float *modelToClipMatrix) 411 | { 412 | // Treat nullptr matrix as a special case, otherwise copy the contents of the pointer and add to state 413 | if (modelToClipMatrix == nullptr) 414 | mCurrentMatrix = nullptr; 415 | else 416 | { 417 | mModelToClipMatrices.AddData(Matrix4x4(modelToClipMatrix)); 418 | mCurrentMatrix = mModelToClipMatrices.GetData()->mValues; 419 | } 420 | } 421 | 422 | void CullingThreadpool::SetVertexLayout(const VertexLayout &vtxLayout) 423 | { 424 | mVertexLayouts.AddData(vtxLayout); 425 | } 426 | 427 | void CullingThreadpool::ClearBuffer() 428 | { 429 | Flush(); 430 | mMOC->ClearBuffer(); 431 | } 432 | 433 | void CullingThreadpool::RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask) 434 | { 435 | #if MOC_RECORDER_ENABLE != 0 436 | mMOC->RecordRenderTriangles( inVtx, inTris, nTris, mCurrentMatrix, clipPlaneMask, bfWinding, *mVertexLayouts.GetData( ) ); 437 | #endif 438 | 439 | for (int i = 0; i < nTris; i += TRIS_PER_JOB) 440 | { 441 | // Yield if work queue is full 442 | while 
(!mRenderQueue->CanWrite()) 443 | std::this_thread::yield(); 444 | 445 | // Create new renderjob 446 | RenderJobQueue::Job *job = mRenderQueue->GetWriteJob(); 447 | job->mBinningJob.mVerts = inVtx; 448 | job->mBinningJob.mTris = inTris + i * 3; 449 | job->mBinningJob.nTris = nTris - i < TRIS_PER_JOB ? nTris - i : TRIS_PER_JOB; 450 | job->mBinningJob.mMatrix = mCurrentMatrix; 451 | job->mBinningJob.mClipPlanes = clipPlaneMask; 452 | job->mBinningJob.mBfWinding = bfWinding; 453 | job->mBinningJob.mVtxLayout = mVertexLayouts.GetData(); 454 | mRenderQueue->AdvanceWriteJob(); 455 | } 456 | } 457 | 458 | CullingThreadpool::CullingResult CullingThreadpool::TestRect(float xmin, float ymin, float xmax, float ymax, float wmin) 459 | { 460 | return mMOC->TestRect(xmin, ymin, xmax, ymax, wmin); 461 | } 462 | 463 | CullingThreadpool::CullingResult CullingThreadpool::TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask) 464 | { 465 | return mMOC->TestTriangles(inVtx, inTris, nTris, mCurrentMatrix, bfWinding, clipPlaneMask, *mVertexLayouts.GetData()); 466 | } 467 | 468 | void CullingThreadpool::ComputePixelDepthBuffer(float *depthData, bool flipY) 469 | { 470 | Flush(); 471 | mMOC->ComputePixelDepthBuffer(depthData, flipY); 472 | } 473 | -------------------------------------------------------------------------------- /CullingThreadpool.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #pragma once 17 | 18 | /*! 19 | * \file CullingThreadpool.h 20 | * \brief Worker threadpool example for threaded masked occlusion culling. 21 | * 22 | * This class implements a threadpool for occluder rendering. Calls to CullingThreadpool::RenderTriangles() 23 | * will immediately return, after adding work items to a queue, and occluder rendering is performed 24 | * by worker threads as quickly as possible. Occlusion queries are performed directly on the calling 25 | * thread and can be performed either synchronously, by calling Flush() before executing the query, or 26 | * asynchronously, by performing the query without waiting for the worker threads to finish. 27 | * 28 | * Note that this implementation should be considered an example rather than the best threading 29 | * solution. You may want to integrate threading in your own task system, and it may also be beneficial 30 | * to thread the traversal code. Refer to MaskedOcclusionCulling::BinTriangles() and 31 | * MaskedOcclusionCulling::RenderTrilist() for functions that can be used to make your own 32 | * threaded culling system. 
33 | */ 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "MaskedOcclusionCulling.h" 41 | 42 | class CullingThreadpool 43 | { 44 | protected: 45 | static const int TRIS_PER_JOB = 1024; // Maximum number of triangles per job (bigger drawcalls are split), affects memory requirements 46 | 47 | typedef MaskedOcclusionCulling::CullingResult CullingResult; 48 | typedef MaskedOcclusionCulling::ClipPlanes ClipPlanes; 49 | typedef MaskedOcclusionCulling::BackfaceWinding BackfaceWinding; 50 | typedef MaskedOcclusionCulling::ScissorRect ScissorRect; 51 | typedef MaskedOcclusionCulling::VertexLayout VertexLayout; 52 | typedef MaskedOcclusionCulling::TriList TriList; 53 | 54 | // Small utility class for 4x4 matrices 55 | struct Matrix4x4 56 | { 57 | float mValues[16]; 58 | Matrix4x4() {} 59 | Matrix4x4(const float *matrix) 60 | { 61 | for (int i = 0; i < 16; ++i) 62 | mValues[i] = matrix[i]; 63 | } 64 | }; 65 | 66 | // Internal utility class for a (mostly) lockless queue for binning & rendering jobs 67 | struct RenderJobQueue 68 | { 69 | struct BinningJob 70 | { 71 | const float* mVerts; 72 | const unsigned int* mTris; 73 | unsigned int nTris; 74 | 75 | const float* mMatrix; 76 | ClipPlanes mClipPlanes; 77 | BackfaceWinding mBfWinding; 78 | const VertexLayout* mVtxLayout; 79 | }; 80 | 81 | struct Job 82 | { 83 | volatile unsigned int mBinningJobStartedIdx; 84 | volatile unsigned int mBinningJobCompletedIdx; 85 | BinningJob mBinningJob; 86 | TriList *mRenderJobs; 87 | }; 88 | 89 | unsigned int mNumBins; 90 | unsigned int mMaxJobs; 91 | 92 | volatile unsigned int mWritePtr; 93 | std::atomic_uint mBinningPtr; 94 | std::atomic_uint *mRenderPtrs; 95 | std::atomic_uint *mBinMutexes; 96 | 97 | float *mTrilistData; 98 | Job *mJobs; 99 | 100 | RenderJobQueue(unsigned int nBins, unsigned int maxJobs); 101 | ~RenderJobQueue(); 102 | 103 | unsigned int GetMinRenderPtr() const; 104 | unsigned int GetBestGlobalQueue() const; 105 | bool IsPipelineEmpty() const; 
106 | 107 | bool CanWrite() const; 108 | bool CanBin() const; 109 | 110 | Job *GetWriteJob(); 111 | void AdvanceWriteJob(); 112 | 113 | Job *GetBinningJob(); 114 | void FinishedBinningJob(Job *job); 115 | 116 | Job *GetRenderJob(int binIdx); 117 | void AdvanceRenderJob(int binIdx); 118 | 119 | void Reset(); 120 | }; 121 | 122 | // Internal utility class for state (matrix / vertex layout) 123 | template struct StateData 124 | { 125 | unsigned int mMaxJobs; 126 | unsigned int mCurrentIdx; 127 | T *mData; 128 | 129 | StateData(unsigned int maxJobs); 130 | ~StateData(); 131 | void AddData(const T &data); 132 | const T *GetData() const; 133 | }; 134 | 135 | // Number of worker threads and bins 136 | unsigned int mNumThreads; 137 | unsigned int mNumBins; 138 | unsigned int mMaxJobs; 139 | unsigned int mBinsW; 140 | unsigned int mBinsH; 141 | 142 | // Threads and control variables 143 | std::mutex mSuspendedMutex; 144 | std::condition_variable mSuspendedCV; 145 | volatile bool mKillThreads; 146 | volatile bool mSuspendThreads; 147 | volatile unsigned int mNumSuspendedThreads; 148 | std::thread *mThreads; 149 | 150 | // State variables and command queue 151 | const float *mCurrentMatrix; 152 | StateData mModelToClipMatrices; 153 | StateData mVertexLayouts; 154 | RenderJobQueue *mRenderQueue; 155 | 156 | // Occlusion culling object and related scissor rectangles 157 | ScissorRect *mRects; 158 | MaskedOcclusionCulling *mMOC; 159 | 160 | void SetupScissors(); 161 | 162 | static void ThreadRun(CullingThreadpool *threadPool, unsigned int threadId); 163 | void ThreadMain(unsigned int threadIdx); 164 | 165 | public: 166 | /*! 167 | * \brief Creates a new threadpool for masked occlusion culling. This object has a 168 | * similar API to the MaskedOcclusionCulling class, but performs occluder 169 | * rendering asynchronously on worker threads (similar to how DX/GL works). 170 | * 171 | * \param numThreads Number of worker threads to perform occluder rendering. 
Best 172 | * balance may be scene/machine dependent, but it's good practice to leave at 173 | * least one full core (2 threads with hyperthreading) for the main thread. 174 | * \param binsW The screen is divided into binsW x binsH rectangular bins for load 175 | * balancing. The number of bins should be atleast equal to the number of 176 | * worker threads. 177 | * \param binsH See description for the binsW parameter. 178 | * \param maxJobs Maximum number of jobs that may be in flight at any given time. If 179 | * the caller thread generates jobs faster than the worker threads can finish 180 | * them, then the job queue will fill up and the caller thread will stall once 181 | * "maxJobs" items have been queued up. For culling systems interleaving occlusion 182 | * queries and rendering, this value should be kept quite low to minimize false 183 | * positives (see TestRect()). We've observed that 32 [default] items typically 184 | * works well for our interleaved queries, while also allowing good load-balancing, 185 | * and this is the recommended setting. 186 | */ 187 | CullingThreadpool(unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs = 32); 188 | 189 | /*! 190 | * \brief Destroys the threadpool and terminates all worker threads. 191 | */ 192 | ~CullingThreadpool(); 193 | 194 | /*! 195 | * \brief Wakes up culling worker threads from suspended sleep, and puts them in a 196 | * ready state (using an idle spinlock with significantly higher CPU overhead). 197 | * 198 | * It may take on the order of 100us to wake up the threads, so this function should 199 | * preferably be called slightly ahead of starting occlusion culling work. 200 | */ 201 | void WakeThreads(); 202 | 203 | /*! 204 | * \brief Suspend all culling worker threads to a low CPU overhead sleep state. 205 | * 206 | * For performance and latency reasons, the culling work is performed in an active 207 | * processing loop (with no thread sleeping) with high CPU overhead. 
In a system 208 | * with more worker threads it's important to put the culling worker threads in a 209 | * low overhead sleep state after occlusion culling work has completed. 210 | */ 211 | void SuspendThreads(); 212 | 213 | /*! 214 | * \brief Waits for all outstanding occluder rendering work to complete. Can be used 215 | * to ensure that rendering has completed before performing a TestRect() or 216 | * TestTriangles() call. 217 | */ 218 | void Flush(); 219 | 220 | /* 221 | * \brief Sets the MaskedOcclusionCulling object (buffer) to be used for rendering and 222 | * testing calls. This method causes a Flush() to ensure that all unfinished 223 | * rendering is completed. 224 | */ 225 | void SetBuffer(MaskedOcclusionCulling *moc); 226 | 227 | /* 228 | * \brief Changes the resolution of the occlusion buffer, see MaskedOcclusionCulling::SetResolution(). 229 | * This method causes a Flush() to ensure that all unfinished rendering is completed. 230 | */ 231 | void SetResolution(unsigned int width, unsigned int height); 232 | 233 | /* 234 | * \brief Sets the near clipping plane, see MaskedOcclusionCulling::SetNearClipPlane(). This 235 | * method causes a Flush() to ensure that all unfinished rendering is completed. 236 | */ 237 | void SetNearClipPlane(float nearDist); 238 | 239 | /* 240 | * \brief Sets the model to clipspace transform matrix used for the RenderTriangles() and TestTriangles() 241 | * function calls. The contents of the matrix is copied, and it's safe to modify it without calling 242 | * Flush(). The copy may be costly, which is the reason for passing this parameter as "state". 243 | * 244 | * \param modelToClipMatrix All vertices will be transformed by the specified model to clipspace matrix. 245 | * Passing nullptr [default] disables the transform (equivalent to using an identity matrix). 
246 | */ 247 | void SetMatrix(const float *modelToClipMatrix = nullptr); 248 | 249 | /* 250 | * \brief Sets the vertex layout used for the RenderTriangles() and TestTriangles() function calls. 251 | * The vertex layout is copied, and it's safe to modify it without calling Flush(). The copy 252 | * may be costly, which is the reason for passing this parameter as "state". 253 | * 254 | * \param vtxLayout A struct specifying the vertex layout (see struct for detailed 255 | * description). For best performance, it is advicable to store position data 256 | * as compactly in memory as possible. 257 | */ 258 | void SetVertexLayout(const VertexLayout &vtxLayout = VertexLayout(16, 4, 12)); 259 | 260 | /* 261 | * \brief Clears the occlusion buffer, see MaskedOcclusionCulling::ClearBuffer(). This method 262 | * causes a Flush() to ensure that all unfinished rendering is completed. 263 | */ 264 | void ClearBuffer(); 265 | 266 | /* 267 | * \brief Asynchronously render occluder triangles, see MaskedOcclusionCulling::RenderTriangles(). 268 | * 269 | * This method puts the drawcall into a command queue, and immediately returns. The rendering is 270 | * performed by the worker threads at the earliest opportunity. 271 | * 272 | * Important: As rendering is performed asynchronously, the application is not allowed to 273 | * change the contents of the *inVtx or *inTris buffers until after rendering is completed. If 274 | * you wish to use dynamic buffers, the application must perform a Flush() to ensure that rendering 275 | * is finished, or make sure to rotate between more buffers than the maximum number of outstanding 276 | * render jobs (see the CullingThreadpool() constructor). 
277 | */ 278 | void RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL); 279 | 280 | /* 281 | * \brief Occlusion query for a rectangle with a given depth, see MaskedOcclusionCulling::TestRect(). 282 | * 283 | * Important: This method is performed on the main thread and does not wait for outstanding 284 | * occluder rendering to be finished. To ensure that all occluder rendering is completed you must 285 | * perform a Flush() prior to calling this function. 286 | * 287 | * It is conservatively correct to perform occlusion queries without calling Flush() (it may only 288 | * lead to objects being incorrectly classified as visible), and it can lead to much better performance 289 | * if occlusion queries are used for traversing a BVH or similar data structure. It's possible to 290 | * use "asynchronous" queries during traversal, and removing false positives later, when rendering 291 | * has completed. 292 | */ 293 | CullingResult TestRect(float xmin, float ymin, float xmax, float ymax, float wmin); 294 | 295 | /* 296 | * \brief Occlusion query for a mesh, see MaskedOcclusionCulling::TestTriangles(). 297 | * 298 | * Important: See the TestRect() method for a brief discussion about asynchronous occlusion 299 | * queries. 300 | */ 301 | CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL); 302 | 303 | /*! 304 | * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation, see 305 | * MaskedOcclusionCulling::ComputePixelDepthBuffer(). This method causes a Flush() to 306 | * ensure that all unfinished rendering is completed. 
307 | */ 308 | void ComputePixelDepthBuffer(float *depthData, bool flipY); 309 | }; 310 | -------------------------------------------------------------------------------- /D3DValidate/D3DValidate.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 
15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "../MaskedOcclusionCulling.h" 25 | #include "../CullingThreadpool.h" 26 | 27 | std::mt19937 gRnd; 28 | std::uniform_real_distribution gRndUniform(0, 1); 29 | 30 | //////////////////////////////////////////////////////////////////////////////////////// 31 | // DX 11 functions 32 | //////////////////////////////////////////////////////////////////////////////////////// 33 | 34 | #define NOMINMAX 35 | #include 36 | #include 37 | #include 38 | 39 | #pragma comment (lib, "d3d11.lib") 40 | #pragma comment (lib, "d3dcompiler.lib") 41 | 42 | ID3D11Texture2D *textureZ; 43 | ID3D11Texture2D *textureCol, *staging; 44 | ID3D11DepthStencilView *textureDSV; 45 | ID3D11RenderTargetView *textureRTV; 46 | ID3D11Device *device; 47 | ID3D11DeviceContext *context; 48 | 49 | ID3D11RasterizerState *rastState; 50 | ID3D11DepthStencilState *DSState; 51 | ID3D11InputLayout *layout; 52 | ID3D11VertexShader *VS; 53 | ID3D11PixelShader *PS; 54 | std::vector vBuffers; 55 | ID3D11Query *endQuery; 56 | 57 | #define D3DVERIFY(X) if (X != S_OK) exit(1); 58 | 59 | void InitD3D(unsigned int width, unsigned int height, D3D_DRIVER_TYPE DriverType = D3D_DRIVER_TYPE_HARDWARE) 60 | { 61 | const char *shader = 62 | "float4 VShader(float4 position : POSITION) : SV_POSITION { return position; }" 63 | "float4 PShader(float4 position : SV_POSITION) : SV_TARGET { return 1.0f - position.z; }"; 64 | 65 | D3D_FEATURE_LEVEL fLevel; 66 | D3DVERIFY(D3D11CreateDevice(nullptr, DriverType, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &device, &fLevel, &context)); 67 | 68 | D3D11_TEXTURE2D_DESC tDesc; 69 | tDesc.Width = width; 70 | tDesc.Height = height; 71 | tDesc.MipLevels = tDesc.ArraySize = 1; 72 | tDesc.SampleDesc.Count = 1; 73 | tDesc.SampleDesc.Quality = 0; 74 | tDesc.CPUAccessFlags = 0; 75 | tDesc.MiscFlags = 0; 76 
| tDesc.Usage = D3D11_USAGE_DEFAULT; 77 | 78 | tDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 79 | tDesc.BindFlags = D3D11_BIND_RENDER_TARGET; 80 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &textureCol)); 81 | 82 | tDesc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; 83 | tDesc.BindFlags = D3D11_BIND_DEPTH_STENCIL; 84 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &textureZ)); 85 | 86 | tDesc.Usage = D3D11_USAGE_STAGING; 87 | tDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 88 | tDesc.BindFlags = 0; 89 | tDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 90 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &staging)); 91 | 92 | D3D11_DEPTH_STENCIL_VIEW_DESC dsvDesc; 93 | dsvDesc.Flags = 0; 94 | dsvDesc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; 95 | dsvDesc.ViewDimension = D3D11_DSV_DIMENSION_TEXTURE2D; 96 | dsvDesc.Texture2D.MipSlice = 0; 97 | D3DVERIFY(device->CreateDepthStencilView(textureZ, &dsvDesc, &textureDSV)); 98 | 99 | D3D11_RENDER_TARGET_VIEW_DESC rtvDesc; 100 | rtvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 101 | rtvDesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; 102 | rtvDesc.Texture2D.MipSlice = 0; 103 | D3DVERIFY(device->CreateRenderTargetView(textureCol, &rtvDesc, &textureRTV)); 104 | 105 | // Set the viewport 106 | D3D11_VIEWPORT viewport; 107 | viewport.TopLeftX = 0; 108 | viewport.TopLeftY = 0; 109 | viewport.Width = (float)width; 110 | viewport.Height = (float)height; 111 | viewport.MinDepth = 0; 112 | viewport.MaxDepth = 1; 113 | context->RSSetViewports(1, &viewport); 114 | 115 | // load and compile the two shaders 116 | ID3D10Blob *VSBlob, *PSBlob; 117 | D3DVERIFY(D3DCompile(shader, strlen(shader), "shader", nullptr, nullptr, "VShader", "vs_5_0", 0, 0, &VSBlob, nullptr)); 118 | D3DVERIFY(D3DCompile(shader, strlen(shader), "shader", nullptr, nullptr, "PShader", "ps_5_0", 0, 0, &PSBlob, nullptr)); 119 | 120 | // encapsulate both shaders into shader objects 121 | D3DVERIFY(device->CreateVertexShader(VSBlob->GetBufferPointer(), VSBlob->GetBufferSize(), 
NULL, &VS)); 122 | D3DVERIFY(device->CreatePixelShader(PSBlob->GetBufferPointer(), PSBlob->GetBufferSize(), NULL, &PS)); 123 | 124 | // set the shader objects 125 | context->VSSetShader(VS, 0, 0); 126 | context->PSSetShader(PS, 0, 0); 127 | 128 | // create the input layout object 129 | D3D11_INPUT_ELEMENT_DESC ied[] = 130 | { 131 | { "POSITION", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, 132 | }; 133 | 134 | D3DVERIFY(device->CreateInputLayout(ied, 1, VSBlob->GetBufferPointer(), VSBlob->GetBufferSize(), &layout)); 135 | context->IASetInputLayout(layout); 136 | 137 | D3D11_RASTERIZER_DESC rDesc; 138 | rDesc.FillMode = D3D11_FILL_SOLID; 139 | rDesc.CullMode = D3D11_CULL_NONE; 140 | rDesc.FrontCounterClockwise = false; 141 | rDesc.DepthBias = 0; 142 | rDesc.DepthBiasClamp = 0; 143 | rDesc.SlopeScaledDepthBias = 0; 144 | rDesc.DepthClipEnable = false; 145 | rDesc.ScissorEnable = false; 146 | rDesc.MultisampleEnable = false; 147 | rDesc.AntialiasedLineEnable = false; 148 | device->CreateRasterizerState(&rDesc, &rastState); 149 | context->RSSetState(rastState); 150 | 151 | D3D11_DEPTH_STENCIL_DESC dsDesc; 152 | ZeroMemory(&dsDesc, sizeof(D3D11_DEPTH_STENCIL_DESC)); 153 | dsDesc.DepthEnable = true; 154 | dsDesc.StencilEnable = false; 155 | dsDesc.DepthFunc = D3D11_COMPARISON_LESS_EQUAL; 156 | dsDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL; 157 | device->CreateDepthStencilState(&dsDesc, &DSState); 158 | context->OMSetDepthStencilState(DSState, 0); 159 | 160 | D3D11_QUERY_DESC qDesc; 161 | qDesc.Query = D3D11_QUERY_EVENT; 162 | qDesc.MiscFlags = 0; 163 | D3DVERIFY(device->CreateQuery(&qDesc, &endQuery)); 164 | } 165 | 166 | bool D3DValidateTriangle(float *verts, MaskedOcclusionCulling *moc) 167 | { 168 | unsigned int width, height; 169 | moc->GetResolution(width, height); 170 | 171 | //////////////////////////////////////////////////////////////////////////////////////// 172 | // Draw triangle using our framework and read back depth 
buffer 173 | //////////////////////////////////////////////////////////////////////////////////////// 174 | 175 | // Draw triangle using our framework 176 | unsigned int indices[3] = { 0,1,2 }; 177 | moc->ClearBuffer(); 178 | moc->RenderTriangles(verts, indices, 1); 179 | 180 | // Read back result 181 | float *depthBuffer = new float[width*height]; 182 | moc->ComputePixelDepthBuffer(depthBuffer, false); 183 | 184 | //////////////////////////////////////////////////////////////////////////////////////// 185 | // Draw triangle using DirectX 11 and read back color image 186 | //////////////////////////////////////////////////////////////////////////////////////// 187 | 188 | // Create D3D buffer 189 | ID3D11Buffer *buf = nullptr; 190 | 191 | D3D11_SUBRESOURCE_DATA iData; 192 | iData.pSysMem = verts; 193 | iData.SysMemPitch = 0; 194 | iData.SysMemSlicePitch = 0; 195 | 196 | D3D11_BUFFER_DESC bDesc; 197 | bDesc.Usage = D3D11_USAGE_DEFAULT; 198 | bDesc.ByteWidth = 3 * 4 * sizeof(float); 199 | bDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; 200 | bDesc.CPUAccessFlags = bDesc.MiscFlags = bDesc.StructureByteStride = 0; 201 | D3DVERIFY(device->CreateBuffer(&bDesc, &iData, &buf)); 202 | 203 | // Clear renderbuffers 204 | float clearColor[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; 205 | context->OMSetRenderTargets(1, &textureRTV, textureDSV); 206 | context->ClearRenderTargetView(textureRTV, clearColor); 207 | context->ClearDepthStencilView(textureDSV, D3D11_CLEAR_DEPTH, 1.0f, 0); 208 | 209 | // Draw triangle using D3D 210 | UINT stride = sizeof(float) * 4, offset = 0; 211 | context->IASetVertexBuffers(0, 1, &buf, &stride, &offset); 212 | context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); 213 | context->Draw(3, 0); 214 | 215 | buf->Release(); 216 | 217 | // Read back resulting D3D image 218 | unsigned char *d3dimg = new unsigned char[width * height * 4]; 219 | D3D11_MAPPED_SUBRESOURCE map; 220 | context->CopyResource(staging, textureCol); 221 | context->Map(staging, 0, 
D3D11_MAP_READ, 0, &map); 222 | memcpy(d3dimg, map.pData, width*height * 4); 223 | context->Unmap(staging, 0); 224 | 225 | //////////////////////////////////////////////////////////////////////////////////////// 226 | // Compare rasterized coverage 227 | //////////////////////////////////////////////////////////////////////////////////////// 228 | 229 | bool identical = true; 230 | for (unsigned int y = 0; y < height; ++y) 231 | { 232 | for (unsigned int x = 0; x < width; ++x) 233 | { 234 | bool d3dcov = (d3dimg[(x + y*width) * 4] + d3dimg[(x + y*width) * 4 + 1] + d3dimg[(x + y*width) * 4 + 2]) != 0; 235 | bool ourcov = depthBuffer[x + y*width] != -1.0f; 236 | 237 | if (d3dcov != ourcov) 238 | identical = false; 239 | } 240 | } 241 | 242 | delete[] depthBuffer; 243 | delete[] d3dimg; 244 | 245 | return identical; 246 | } 247 | 248 | bool D3DValidateTriangleThreaded(float *verts, unsigned int width, unsigned int height, CullingThreadpool *ctp) 249 | { 250 | 251 | //////////////////////////////////////////////////////////////////////////////////////// 252 | // Draw triangle using our framework and read back depth buffer 253 | //////////////////////////////////////////////////////////////////////////////////////// 254 | 255 | // Draw triangle using our framework 256 | unsigned int indices[3] = { 0,1,2 }; 257 | ctp->ClearBuffer(); 258 | ctp->RenderTriangles(verts, indices, 1); 259 | ctp->Flush(); 260 | 261 | 262 | // Read back result 263 | float *depthBuffer = new float[width*height]; 264 | ctp->ComputePixelDepthBuffer(depthBuffer, false); 265 | 266 | //////////////////////////////////////////////////////////////////////////////////////// 267 | // Draw triangle using DirectX 11 and read back color image 268 | //////////////////////////////////////////////////////////////////////////////////////// 269 | 270 | // Create D3D buffer 271 | ID3D11Buffer *buf = nullptr; 272 | 273 | D3D11_SUBRESOURCE_DATA iData; 274 | iData.pSysMem = verts; 275 | iData.SysMemPitch = 0; 276 | 
iData.SysMemSlicePitch = 0; 277 | 278 | D3D11_BUFFER_DESC bDesc; 279 | bDesc.Usage = D3D11_USAGE_DEFAULT; 280 | bDesc.ByteWidth = 3 * 4 * sizeof(float); 281 | bDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; 282 | bDesc.CPUAccessFlags = bDesc.MiscFlags = bDesc.StructureByteStride = 0; 283 | D3DVERIFY(device->CreateBuffer(&bDesc, &iData, &buf)); 284 | 285 | // Clear renderbuffers 286 | float clearColor[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; 287 | context->OMSetRenderTargets(1, &textureRTV, textureDSV); 288 | context->ClearRenderTargetView(textureRTV, clearColor); 289 | context->ClearDepthStencilView(textureDSV, D3D11_CLEAR_DEPTH, 1.0f, 0); 290 | 291 | // Draw triangle using D3D 292 | UINT stride = sizeof(float) * 4, offset = 0; 293 | context->IASetVertexBuffers(0, 1, &buf, &stride, &offset); 294 | context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); 295 | context->Draw(3, 0); 296 | 297 | buf->Release(); 298 | 299 | // Read back resulting D3D image 300 | unsigned char *d3dimg = new unsigned char[width * height * 4]; 301 | D3D11_MAPPED_SUBRESOURCE map; 302 | context->CopyResource(staging, textureCol); 303 | context->Map(staging, 0, D3D11_MAP_READ, 0, &map); 304 | memcpy(d3dimg, map.pData, width*height * 4); 305 | context->Unmap(staging, 0); 306 | 307 | //////////////////////////////////////////////////////////////////////////////////////// 308 | // Compare rasterized coverage 309 | //////////////////////////////////////////////////////////////////////////////////////// 310 | 311 | bool identical = true; 312 | for (unsigned int y = 0; y < height; ++y) 313 | { 314 | for (unsigned int x = 0; x < width; ++x) 315 | { 316 | bool d3dcov = (d3dimg[(x + y*width) * 4] + d3dimg[(x + y*width) * 4 + 1] + d3dimg[(x + y*width) * 4 + 2]) != 0; 317 | bool ourcov = depthBuffer[x + y*width] != -1.0f; 318 | 319 | if (d3dcov != ourcov) 320 | identical = false; 321 | } 322 | } 323 | 324 | delete[] depthBuffer; 325 | delete[] d3dimg; 326 | 327 | return identical; 328 | } 329 | 330 | 
//////////////////////////////////////////////////////////////////////////////////////// 331 | // Random triangle generator 332 | //////////////////////////////////////////////////////////////////////////////////////// 333 | // Fills verts[0..11] with three (x, y, 0, 1) vertices whose x/y are gRndUniform(gRnd) * 2 - 1, redrawing until the 2D cross product of the two edges from vertex 0 is non-negative. 334 | void RandomTriangle(float *verts) 335 | { 336 | float xprod = -1; 337 | while (xprod < 0.0f) 338 | { 339 | for (unsigned int i = 0; i < 3; ++i) 340 | { 341 | verts[i * 4 + 0] = gRndUniform(gRnd) * 2.0f - 1.0f; 342 | verts[i * 4 + 1] = gRndUniform(gRnd) * 2.0f - 1.0f; 343 | verts[i * 4 + 2] = 0.0f; 344 | verts[i * 4 + 3] = 1.0f; 345 | } 346 | 347 | // Test if triangle is front facing (ccw winded) 348 | float v0x = verts[4] - verts[0]; 349 | float v0y = verts[5] - verts[1]; 350 | float v1x = verts[8] - verts[0]; 351 | float v1y = verts[9] - verts[1]; 352 | xprod = (v0x * v1y) - (v0y * v1x); 353 | } 354 | } 355 | 356 | 357 | //////////////////////////////////////////////////////////////////////////////////////// 358 | // Simple Command Line parser 359 | //////////////////////////////////////////////////////////////////////////////////////// 360 | // Returns the argument that follows the first occurrence of option in [begin, end), or 0 when option is absent or is the last argument. 361 | char* getCmdOption(char ** begin, char ** end, const std::string & option) 362 | { 363 | char ** itr = std::find(begin, end, option); 364 | if (itr != end && ++itr != end) 365 | { 366 | return *itr; 367 | } 368 | return 0; 369 | } 370 | // Returns true when option appears anywhere in [begin, end). 371 | bool cmdOptionExists(char** begin, char** end, const std::string& option) 372 | { 373 | return std::find(begin, end, option) != end; 374 | } 375 | //////////////////////////////////////////////////////////////////////////////////////// 376 | // Main: Validate a large number of randomized triangles 377 | //////////////////////////////////////////////////////////////////////////////////////// 378 | // Options: -h prints usage and exits; -a WARP|REF selects the D3D driver type (hardware when -a is absent). 379 | int main(int argc, char* argv[]) 380 | { 381 | const int width = 2048, height = 2048; 382 | 383 | // Flush denorms to zero to avoid performance issues with small values 384 | _mm_setcsr(_mm_getcsr() | 0x8040); 385 | 386 | if (cmdOptionExists(argv, argv + 
argc, "-h")) 387 | { 388 | printf("-Help\n"); 389 | printf("ValidateTest [-a WARP|REF]\n"); 390 | exit(0); 391 | } 392 | 393 | char * pOptions = getCmdOption(argv, argv + argc, "-a"); 394 | 395 | // Initialize directx 396 | if (pOptions) 397 | { 398 | if (!strcmp(pOptions, "WARP")) // exact match: strncmp bounded by strlen(pOptions) accepted any prefix, and "" matched both branches 399 | { 400 | printf("Testing against WARP\n"); 401 | InitD3D(width, height, D3D_DRIVER_TYPE_WARP); 402 | } 403 | 404 | else if (!strcmp(pOptions, "REF")) 405 | { 406 | printf("Testing against REF\n"); 407 | InitD3D(width, height, D3D_DRIVER_TYPE_REFERENCE); 408 | } 409 | else { printf("Unknown adapter '%s', expected WARP or REF\n", pOptions); exit(1); } // never continue with D3D uninitialized 410 | } 411 | else 412 | { 413 | printf("Testing against Hardware\n"); 414 | InitD3D(width, height); 415 | } 416 | 417 | 418 | MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(); 419 | 420 | //////////////////////////////////////////////////////////////////////////////////////// 421 | // Setup and state related code 422 | //////////////////////////////////////////////////////////////////////////////////////// 423 | 424 | 425 | // Setup a rendertarget with near clip plane at w = 1.0 426 | moc->SetResolution(width, height); 427 | moc->SetNearClipPlane(1.0f); 428 | 429 | float verts[3*4]; 430 | static const int nTriangles = 1000; 431 | int nPassed = 0; 432 | for (int i = 0; i < nTriangles; ++i) 433 | { 434 | RandomTriangle(verts); 435 | bool pass = D3DValidateTriangle(verts, moc); 436 | if (pass) 437 | nPassed++; 438 | else 439 | printf("Testing triangle %d... 
FAILED\n", i); 440 | 441 | if (i % 100 == 0) 442 | printf("Testing triangle %d\n", i); 443 | } 444 | printf("%d / %d triangles passed\n", nPassed, nTriangles); 445 | 446 | int numThreads = std::max(1, (int)std::thread::hardware_concurrency() - 1); // hardware_concurrency() may return 0 or 1; always keep at least one worker 447 | printf("\n\nTesting Masked multi threaded code path (using binning), %d threads\n", numThreads); 448 | printf("----\n"); 449 | CullingThreadpool ctp(numThreads, 2, numThreads); 450 | ctp.SetBuffer(moc); 451 | ctp.WakeThreads(); 452 | 453 | unsigned int mocwidth, mocheight; 454 | moc->GetResolution(mocwidth, mocheight); 455 | // Reset pass count for the threaded run 456 | nPassed = 0; 457 | for (int i = 0; i < nTriangles; ++i) 458 | { 459 | RandomTriangle(verts); 460 | bool pass = D3DValidateTriangleThreaded(verts, mocwidth, mocheight, &ctp); 461 | if (pass) 462 | nPassed++; 463 | else 464 | printf("Testing triangle %d... FAILED\n", i); 465 | 466 | if (i % 100 == 0) 467 | printf("Testing triangle %d\n", i); 468 | } 469 | printf("%d / %d triangles passed\n", nPassed, nTriangles); 470 | ctp.SuspendThreads(); 471 | } 472 | -------------------------------------------------------------------------------- /D3DValidate/D3DValidate.sln: -------------------------------------------------------------------------------- 1 | ﻿ 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "D3DValidate", "D3DValidate.vcxproj", "{02474EA7-2575-4AE7-A86C-6125EE7D1F08}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_LLVM|x64 = Debug_LLVM|x64 11 | Debug|x64 = Debug|x64 12 | Release_LLVM|x64 = Release_LLVM|x64 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 17 | 
{02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 18 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug|x64.ActiveCfg = Debug|x64 19 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Debug|x64.Build.0 = Debug|x64 20 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 21 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 22 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release|x64.ActiveCfg = Release|x64 23 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /D3DValidate/D3DValidate.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | Debug_LLVM 14 | x64 15 | 16 | 17 | Release_LLVM 18 | x64 19 | 20 | 21 | 22 | {02474EA7-2575-4AE7-A86C-6125EE7D1F08} 23 | Win32Proj 24 | D3DValidate 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | Unicode 33 | 34 | 35 | Application 36 | true 37 | LLVM-vs2014 38 | Unicode 39 | 40 | 41 | Application 42 | false 43 | v141 44 | Unicode 45 | 46 | 47 | Application 48 | false 49 | LLVM-vs2014 50 | true 51 | Unicode 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | true 73 | 74 | 75 | true 76 | 77 | 78 | false 79 | 80 | 81 | false 82 | 83 | 84 | 85 | 86 | 87 | Level3 88 | Disabled 89 | PRECISE_COVERAGE=1;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Console 93 | true 94 | 95 | 96 | 97 | 98 | 99 | 100 | Level3 101 | Disabled 102 | PRECISE_COVERAGE=1;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 103 | -msse4.1 %(AdditionalOptions) 104 | 105 | 106 | Console 107 | true 108 | 109 | 110 | 111 | 112 | Level3 113 | 114 
| 115 | MaxSpeed 116 | true 117 | true 118 | PRECISE_COVERAGE=1;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 119 | 120 | 121 | Console 122 | true 123 | true 124 | true 125 | 126 | 127 | 128 | 129 | Level3 130 | 131 | 132 | MaxSpeed 133 | true 134 | true 135 | PRECISE_COVERAGE=1;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 136 | -msse4.1 %(AdditionalOptions) 137 | 138 | 139 | Console 140 | true 141 | true 142 | true 143 | 144 | 145 | 146 | 147 | 148 | 149 | AdvancedVectorExtensions2 150 | AdvancedVectorExtensions2 151 | AdvancedVectorExtensions2 152 | AdvancedVectorExtensions2 153 | AdvancedVectorExtensions2 154 | AdvancedVectorExtensions2 155 | 156 | 157 | AdvancedVectorExtensions2 158 | AdvancedVectorExtensions2 159 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 160 | false 161 | AdvancedVectorExtensions2 162 | AdvancedVectorExtensions2 163 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /D3DValidate/D3DValidate.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | MaskedOcclusionCullingLibrary 7 | 8 | 9 | MaskedOcclusionCullingLibrary 10 | 11 | 12 | MaskedOcclusionCullingLibrary 13 | 14 | 15 | MaskedOcclusionCullingLibrary 16 | 17 | 18 | 19 | 20 | MaskedOcclusionCullingLibrary 21 | 22 | 23 | MaskedOcclusionCullingLibrary 24 | 25 | 26 | 27 | 28 | {d8c0b3c7-550e-44c2-ac12-708a4d072746} 29 | 30 | 31 | 32 | 33 | MaskedOcclusionCullingLibrary 34 | 35 | 36 | MaskedOcclusionCullingLibrary 37 | 38 | 39 | -------------------------------------------------------------------------------- /Example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required( VERSION 2.6 ) 2 | project( Example ) 3 | set(CMAKE_SUPPRESS_REGENERATION true) 4 | 
set(CMAKE_CONFIGURATION_TYPES Debug Release) 5 | 6 | # 7 | # Compile masked occlusion culling library 8 | # 9 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/.. ./MaskedOcclusionCulling) 10 | 11 | # 12 | # Set compiler flags 13 | # 14 | if(MSVC) 15 | set(CMAKE_CXX_FLAGS "-std=c++11") 16 | else() 17 | set(CMAKE_CXX_FLAGS "-std=c++11 -m64") 18 | endif() 19 | 20 | # 21 | # Build executable 22 | # 23 | add_executable(Example ExampleMain.cpp) 24 | 25 | # 26 | # Link with the occlusion culling library 27 | # 28 | target_link_libraries (Example LINK_PUBLIC MaskedOcclusionCulling) -------------------------------------------------------------------------------- /Example/Example.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Example", "Example.vcxproj", "{0A343471-42A8-4C29-AD98-9A976C514336}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_LLVM|x64 = Debug_LLVM|x64 11 | Debug|x64 = Debug|x64 12 | Release_LLVM|x64 = Release_LLVM|x64 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 17 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 18 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug|x64.ActiveCfg = Debug|x64 19 | {0A343471-42A8-4C29-AD98-9A976C514336}.Debug|x64.Build.0 = Debug|x64 20 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 21 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 22 | {0A343471-42A8-4C29-AD98-9A976C514336}.Release|x64.ActiveCfg = Release|x64 23 | 
{0A343471-42A8-4C29-AD98-9A976C514336}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /Example/Example.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Debug_LLVM 10 | x64 11 | 12 | 13 | Release 14 | x64 15 | 16 | 17 | Release_LLVM 18 | x64 19 | 20 | 21 | 22 | {0A343471-42A8-4C29-AD98-9A976C514336} 23 | Win32Proj 24 | Example 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | Unicode 33 | 34 | 35 | Application 36 | true 37 | LLVM-vs2014 38 | Unicode 39 | 40 | 41 | Application 42 | false 43 | v141 44 | true 45 | Unicode 46 | 47 | 48 | Application 49 | false 50 | LLVM-vs2014 51 | true 52 | Unicode 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | true 74 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 75 | 76 | 77 | true 78 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 79 | 80 | 81 | false 82 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 83 | 84 | 85 | false 86 | $(VC_IncludePath);$(WindowsSDK_IncludePath); 87 | 88 | 89 | 90 | 91 | 92 | Level3 93 | Disabled 94 | _CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 95 | NotSet 96 | false 97 | 98 | 99 | Console 100 | true 101 | 102 | 103 | 104 | 105 | 106 | 107 | Level3 108 | Disabled 109 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 110 | NotSet 111 | false 112 | -msse4.1 %(AdditionalOptions) 113 | 114 | 115 | Console 116 | true 117 | 118 | 119 | 120 | 121 | Level3 122 | 123 | 124 | MaxSpeed 125 | true 126 | true 127 | _CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 128 | NotSet 129 | false 130 | 131 | 132 | Console 133 | true 134 | true 135 | DebugFastLink 136 | %(AdditionalDependencies) 137 | NotSet 138 | 139 
| 140 | 141 | 142 | Level3 143 | 144 | 145 | MinSpace 146 | false 147 | true 148 | _CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | NotSet 150 | false 151 | -msse4.1 %(AdditionalOptions) 152 | 153 | 154 | Console 155 | true 156 | true 157 | DebugFastLink 158 | %(AdditionalDependencies) 159 | 160 | 161 | 162 | 163 | 164 | NotSet 165 | NotSet 166 | 167 | 168 | AdvancedVectorExtensions2 169 | AdvancedVectorExtensions2 170 | AdvancedVectorExtensions2 171 | AdvancedVectorExtensions2 172 | 173 | 174 | AdvancedVectorExtensions2 175 | AdvancedVectorExtensions2 176 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 177 | AdvancedVectorExtensions2 178 | AdvancedVectorExtensions2 179 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 180 | 181 | 182 | NotSet 183 | NotSet 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /Example/Example.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | MaskedOcclusionCullingLibrary 7 | 8 | 9 | MaskedOcclusionCullingLibrary 10 | 11 | 12 | MaskedOcclusionCullingLibrary 13 | 14 | 15 | MaskedOcclusionCullingLibrary 16 | 17 | 18 | 19 | 20 | MaskedOcclusionCullingLibrary 21 | 22 | 23 | MaskedOcclusionCullingLibrary 24 | 25 | 26 | 27 | 28 | {e1b85872-c29a-4dd3-b9db-e4ff2dedded9} 29 | 30 | 31 | 32 | 33 | MaskedOcclusionCullingLibrary 34 | 35 | 36 | MaskedOcclusionCullingLibrary 37 | 38 | 39 | -------------------------------------------------------------------------------- /Example/ExampleMain.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with 
the License. You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #ifndef _CRT_SECURE_NO_WARNINGS 17 | #define _CRT_SECURE_NO_WARNINGS 18 | #endif 19 | #include 20 | #include 21 | #include 22 | #include 23 | #ifdef _WIN32 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #include "../MaskedOcclusionCulling.h" 30 | 31 | //////////////////////////////////////////////////////////////////////////////////////// 32 | // Image utility functions, minimal BMP writer and depth buffer tone mapping 33 | //////////////////////////////////////////////////////////////////////////////////////// 34 | 35 | static void WriteBMP(const char *filename, const unsigned char *data, int w, int h) 36 | { 37 | short header[] = { 0x4D42, 0, 0, 0, 0, 26, 0, 12, 0, (short)w, (short)h, 1, 24 }; 38 | FILE *f = fopen(filename, "wb"); 39 | fwrite(header, 1, sizeof(header), f); 40 | #if USE_D3D == 1 41 | // Flip image because Y axis of Direct3D points in the opposite direction of bmp. If the library 42 | // is configured for OpenGL (USE_D3D 0) then the Y axes would match and this wouldn't be required. 
43 | for (int y = 0; y < h; ++y) 44 | fwrite(&data[(h - y - 1) * w * 3], 1, w * 3, f); 45 | #else 46 | fwrite(data, 1, w * h * 3, f); 47 | #endif 48 | fclose(f); 49 | } 50 | 51 | static void TonemapDepth(float *depth, unsigned char *image, int w, int h) 52 | { 53 | // Find min/max w coordinate (discard cleared pixels) 54 | float minW = FLT_MAX, maxW = 0.0f; 55 | for (int i = 0; i < w*h; ++i) 56 | { 57 | if (depth[i] > 0.0f) 58 | { 59 | minW = std::min(minW, depth[i]); 60 | maxW = std::max(maxW, depth[i]); 61 | } 62 | } 63 | 64 | // Tonemap depth values 65 | for (int i = 0; i < w*h; ++i) 66 | { 67 | int intensity = 0; 68 | if (depth[i] > 0) 69 | intensity = (unsigned char)(223.0*(depth[i] - minW) / (maxW - minW) + 32.0); 70 | 71 | image[i * 3 + 0] = intensity; 72 | image[i * 3 + 1] = intensity; 73 | image[i * 3 + 2] = intensity; 74 | } 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////////////// 78 | // Tutorial example code 79 | //////////////////////////////////////////////////////////////////////////////////////// 80 | 81 | int main(int argc, char* argv[]) 82 | { 83 | // Flush denorms to zero to avoid performance issues with small values 84 | _mm_setcsr(_mm_getcsr() | 0x8040); 85 | 86 | MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(); 87 | 88 | //////////////////////////////////////////////////////////////////////////////////////// 89 | // Print which version (instruction set) is being used 90 | //////////////////////////////////////////////////////////////////////////////////////// 91 | 92 | MaskedOcclusionCulling::Implementation implementation = moc->GetImplementation(); 93 | switch (implementation) { 94 | case MaskedOcclusionCulling::SSE2: printf("Using SSE2 version\n"); break; 95 | case MaskedOcclusionCulling::SSE41: printf("Using SSE41 version\n"); break; 96 | case MaskedOcclusionCulling::AVX2: printf("Using AVX2 version\n"); break; 97 | case MaskedOcclusionCulling::AVX512: printf("Using AVX-512 
version\n"); break; 98 | } 99 | 100 | //////////////////////////////////////////////////////////////////////////////////////// 101 | // Setup and state related code 102 | //////////////////////////////////////////////////////////////////////////////////////// 103 | 104 | // Setup a 1920 x 1080 rendertarget with near clip plane at w = 1.0 105 | const int width = 1920, height = 1080; 106 | moc->SetResolution(width, height); 107 | moc->SetNearClipPlane(1.0f); 108 | 109 | // Clear the depth buffer 110 | moc->ClearBuffer(); 111 | 112 | //////////////////////////////////////////////////////////////////////////////////////// 113 | // Render some occluders 114 | //////////////////////////////////////////////////////////////////////////////////////// 115 | struct ClipspaceVertex { float x, y, z, w; }; 116 | 117 | // A triangle that intersects the view frustum 118 | ClipspaceVertex triVerts[] = { { 5, 0, 0, 10 }, { 30, 0, 0, 20 }, { 10, 50, 0, 40 } }; 119 | unsigned int triIndices[] = { 0, 1, 2 }; 120 | 121 | // Render the triangle 122 | moc->RenderTriangles((float*)triVerts, triIndices, 1); 123 | 124 | // A clockwise winded (backfacing) triangle 125 | ClipspaceVertex cwTriVerts[] = { { 7, -7, 0, 20 },{ 7.5, -7, 0, 20 },{ 7, -7.5, 0, 20 } }; 126 | unsigned int cwTriIndices[] = { 0, 1, 2 }; 127 | 128 | // Render with counter-clockwise backface culling, the triangle is rendered 129 | moc->RenderTriangles((float*)cwTriVerts, cwTriIndices, 1, nullptr, MaskedOcclusionCulling::BACKFACE_CCW); 130 | 131 | // A quad completely within the view frustum 132 | ClipspaceVertex quadVerts[] = { { -150, -150, 0, 200 }, { -10, -65, 0, 75 }, { 0, 0, 0, 20 }, { -40, 10, 0, 50 } }; 133 | unsigned int quadIndices[] = { 0, 1, 2, 0, 2, 3 }; 134 | 135 | // Render the quad. 
As an optimization, indicate that clipping is not required as it is 136 | // completely inside the view frustum 137 | moc->RenderTriangles((float*)quadVerts, quadIndices, 2, nullptr, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_NONE); 138 | 139 | // A triangle specified on struct of arrays (SoA) form 140 | float SoAVerts[] = { 141 | 10, 10, 7, // x-coordinates 142 | -10, -7, -10, // y-coordinates 143 | 10, 10, 10 // w-coordinates 144 | }; 145 | 146 | // Set vertex layout (stride, y offset, w offset) 147 | MaskedOcclusionCulling::VertexLayout SoAVertexLayout(sizeof(float), 3 * sizeof(float), 6 * sizeof(float)); 148 | 149 | // Render triangle with SoA layout 150 | moc->RenderTriangles((float*)SoAVerts, triIndices, 1, nullptr, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_ALL, SoAVertexLayout); 151 | 152 | 153 | //////////////////////////////////////////////////////////////////////////////////////// 154 | // Perform some occlusion queries 155 | //////////////////////////////////////////////////////////////////////////////////////// 156 | 157 | // A triangle, partly overlapped by the quad 158 | ClipspaceVertex oqTriVerts[] = { { 0, 50, 0, 200 }, { -60, -60, 0, 200 }, { 20, -40, 0, 200 } }; 159 | unsigned int oqTriIndices[] = { 0, 1, 2 }; 160 | 161 | // Perform an occlusion query. 
The triangle is visible and the query should return VISIBLE 162 | MaskedOcclusionCulling::CullingResult result; 163 | result = moc->TestTriangles((float*)oqTriVerts, oqTriIndices, 1); 164 | if (result == MaskedOcclusionCulling::VISIBLE) 165 | printf("Tested triangle is VISIBLE\n"); 166 | else if (result == MaskedOcclusionCulling::OCCLUDED) 167 | printf("Tested triangle is OCCLUDED\n"); 168 | else if (result == MaskedOcclusionCulling::VIEW_CULLED) 169 | printf("Tested triangle is outside view frustum\n"); 170 | 171 | // Render the occlusion query triangle to show its position 172 | moc->RenderTriangles((float*)oqTriVerts, oqTriIndices, 1); 173 | 174 | 175 | // Perform an occlusion query testing if a rectangle is visible. The rectangle is completely 176 | // behind the previously drawn quad, so the query should indicate that it's occluded 177 | result = moc->TestRect(-0.6f, -0.6f, -0.4f, -0.4f, 100); 178 | if (result == MaskedOcclusionCulling::VISIBLE) 179 | printf("Tested rect is VISIBLE\n"); 180 | else if (result == MaskedOcclusionCulling::OCCLUDED) 181 | printf("Tested rect is OCCLUDED\n"); 182 | else if (result == MaskedOcclusionCulling::VIEW_CULLED) 183 | printf("Tested rect is outside view frustum\n"); 184 | 185 | // Compute a per pixel depth buffer from the hierarchical depth buffer, used for visualization. 
186 | float *perPixelZBuffer = new float[width * height]; 187 | moc->ComputePixelDepthBuffer(perPixelZBuffer, false); 188 | 189 | // Tonemap the image 190 | unsigned char *image = new unsigned char[width * height * 3]; 191 | TonemapDepth(perPixelZBuffer, image, width, height); 192 | WriteBMP("image.bmp", image, width, height); 193 | delete[] image; 194 | 195 | // Destroy occlusion culling object and free hierarchical z-buffer 196 | MaskedOcclusionCulling::Destroy(moc); 197 | 198 | } 199 | -------------------------------------------------------------------------------- /FillrateTest/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required( VERSION 2.6 ) 2 | project( FillrateTest ) 3 | set(CMAKE_SUPPRESS_REGENERATION true) 4 | set(CMAKE_CONFIGURATION_TYPES Debug Release) 5 | 6 | # 7 | # Compile masked occlusion culling library 8 | # 9 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/.. ./MaskedOcclusionCulling) 10 | 11 | # 12 | # Set compiler flags 13 | # 14 | if(MSVC) 15 | set(CMAKE_CXX_FLAGS "-std=c++11") 16 | else() 17 | set(CMAKE_CXX_FLAGS "-std=c++11 -m64") 18 | endif() 19 | 20 | # 21 | # Build executable 22 | # 23 | add_executable(FillrateTest FillrateTest.cpp) 24 | 25 | # 26 | # Link with the occlusion culling library 27 | # 28 | target_link_libraries (FillrateTest LINK_PUBLIC MaskedOcclusionCulling) 29 | -------------------------------------------------------------------------------- /FillrateTest/FillrateTest.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #ifdef _WIN32 25 | #include 26 | #else 27 | #include 28 | #endif 29 | 30 | #include "../CullingThreadpool.h" 31 | #include "../MaskedOcclusionCulling.h" 32 | 33 | std::mt19937 gRnd; 34 | std::uniform_real_distribution gRndUniform(0, 1); 35 | 36 | #define F_PI 3.14159265358979323846f 37 | 38 | // if 1, makes some of the random triangles slightly out of screen and enables clipping 39 | #define TEST_CLIPPING 0 40 | 41 | #ifdef _WIN32 42 | //////////////////////////////////////////////////////////////////////////////////////// 43 | // DX 11 setup & resource creation code 44 | //////////////////////////////////////////////////////////////////////////////////////// 45 | 46 | #define NOMINMAX 47 | #include 48 | #include 49 | #include 50 | 51 | #pragma comment (lib, "d3d11.lib") 52 | #pragma comment (lib, "d3dcompiler.lib") 53 | 54 | ID3D11Texture2D *textureZ; 55 | ID3D11Texture2D *textureCol, *staging; 56 | ID3D11DepthStencilView *textureDSV; 57 | ID3D11RenderTargetView *textureRTV; 58 | ID3D11Device *device; 59 | ID3D11DeviceContext *context; 60 | 61 | ID3D11RasterizerState *rastState; 62 | ID3D11DepthStencilState *DSState; 63 | ID3D11InputLayout *layout; 64 | ID3D11VertexShader *VS; 65 | ID3D11PixelShader *PS; 66 | std::vector vBuffers; 67 | ID3D11Query *endQuery; 68 | 69 | #define D3DVERIFY(X) if (X != S_OK) exit(1); 70 | 
71 | void InitD3D(unsigned int width, unsigned int height) 72 | { 73 | const char *shader = 74 | "float4 VShader(float4 position : POSITION) : SV_POSITION { return position; }" 75 | "float4 PShader(float4 position : SV_POSITION) : SV_TARGET { return 1.0f - position.z; }"; 76 | 77 | D3D_FEATURE_LEVEL fLevel; 78 | D3DVERIFY(D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &device, &fLevel, &context)); 79 | 80 | D3D11_TEXTURE2D_DESC tDesc; 81 | tDesc.Width = width; 82 | tDesc.Height = height; 83 | tDesc.MipLevels = tDesc.ArraySize = 1; 84 | tDesc.SampleDesc.Count = 1; 85 | tDesc.SampleDesc.Quality = 0; 86 | tDesc.CPUAccessFlags = 0; 87 | tDesc.MiscFlags = 0; 88 | tDesc.Usage = D3D11_USAGE_DEFAULT; 89 | 90 | tDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 91 | tDesc.BindFlags = D3D11_BIND_RENDER_TARGET; 92 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &textureCol)); 93 | 94 | tDesc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; 95 | tDesc.BindFlags = D3D11_BIND_DEPTH_STENCIL; 96 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &textureZ)); 97 | 98 | tDesc.Usage = D3D11_USAGE_STAGING; 99 | tDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 100 | tDesc.BindFlags = 0; 101 | tDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 102 | D3DVERIFY(device->CreateTexture2D(&tDesc, nullptr, &staging)); 103 | 104 | D3D11_DEPTH_STENCIL_VIEW_DESC dsvDesc; 105 | dsvDesc.Flags = 0; 106 | dsvDesc.Format = DXGI_FORMAT_D24_UNORM_S8_UINT; 107 | dsvDesc.ViewDimension = D3D11_DSV_DIMENSION_TEXTURE2D; 108 | dsvDesc.Texture2D.MipSlice = 0; 109 | D3DVERIFY(device->CreateDepthStencilView(textureZ, &dsvDesc, &textureDSV)); 110 | 111 | D3D11_RENDER_TARGET_VIEW_DESC rtvDesc; 112 | rtvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; 113 | rtvDesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; 114 | rtvDesc.Texture2D.MipSlice = 0; 115 | D3DVERIFY(device->CreateRenderTargetView(textureCol, &rtvDesc, &textureRTV)); 116 | 117 | // Set the viewport 118 | D3D11_VIEWPORT viewport; 
119 | viewport.TopLeftX = 0; 120 | viewport.TopLeftY = 0; 121 | viewport.Width = (float)width; 122 | viewport.Height = (float)height; 123 | viewport.MinDepth = 0; 124 | viewport.MaxDepth = 1; 125 | context->RSSetViewports(1, &viewport); 126 | 127 | // load and compile the two shaders 128 | ID3D10Blob *VSBlob, *PSBlob; 129 | D3DVERIFY(D3DCompile(shader, strlen(shader), "shader", nullptr, nullptr, "VShader", "vs_5_0", 0, 0, &VSBlob, nullptr)); 130 | D3DVERIFY(D3DCompile(shader, strlen(shader), "shader", nullptr, nullptr, "PShader", "ps_5_0", 0, 0, &PSBlob, nullptr)); 131 | 132 | // encapsulate both shaders into shader objects 133 | D3DVERIFY(device->CreateVertexShader(VSBlob->GetBufferPointer(), VSBlob->GetBufferSize(), NULL, &VS)); 134 | D3DVERIFY(device->CreatePixelShader(PSBlob->GetBufferPointer(), PSBlob->GetBufferSize(), NULL, &PS)); 135 | 136 | // set the shader objects 137 | context->VSSetShader(VS, 0, 0); 138 | context->PSSetShader(PS, 0, 0); 139 | 140 | // create the input layout object 141 | D3D11_INPUT_ELEMENT_DESC ied[] = 142 | { 143 | { "POSITION", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, 144 | }; 145 | 146 | D3DVERIFY(device->CreateInputLayout(ied, 1, VSBlob->GetBufferPointer(), VSBlob->GetBufferSize(), &layout)); 147 | context->IASetInputLayout(layout); 148 | 149 | D3D11_RASTERIZER_DESC rDesc; 150 | rDesc.FillMode = D3D11_FILL_SOLID; 151 | rDesc.CullMode = D3D11_CULL_NONE; 152 | rDesc.FrontCounterClockwise = false; 153 | rDesc.DepthBias = 0; 154 | rDesc.DepthBiasClamp = 0; 155 | rDesc.SlopeScaledDepthBias = 0; 156 | rDesc.DepthClipEnable = false; 157 | rDesc.ScissorEnable = false; 158 | rDesc.MultisampleEnable = false; 159 | rDesc.AntialiasedLineEnable = false; 160 | device->CreateRasterizerState(&rDesc, &rastState); 161 | context->RSSetState(rastState); 162 | 163 | D3D11_DEPTH_STENCIL_DESC dsDesc; 164 | ZeroMemory(&dsDesc, sizeof(D3D11_DEPTH_STENCIL_DESC)); 165 | dsDesc.DepthEnable = true; 166 | dsDesc.StencilEnable = 
false; 167 | dsDesc.DepthFunc = D3D11_COMPARISON_LESS_EQUAL; 168 | dsDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL; 169 | device->CreateDepthStencilState(&dsDesc, &DSState); 170 | context->OMSetDepthStencilState(DSState, 0); 171 | 172 | D3D11_QUERY_DESC qDesc; 173 | qDesc.Query = D3D11_QUERY_EVENT; 174 | qDesc.MiscFlags = 0; 175 | D3DVERIFY(device->CreateQuery(&qDesc, &endQuery)); 176 | } 177 | 178 | void D3DAddTriangles(float *verts, int nTris) 179 | { 180 | ID3D11Buffer *buf = nullptr; 181 | 182 | D3D11_SUBRESOURCE_DATA iData; 183 | iData.pSysMem = verts; 184 | iData.SysMemPitch = 0; 185 | iData.SysMemSlicePitch = 0; 186 | 187 | D3D11_BUFFER_DESC bDesc; 188 | bDesc.Usage = D3D11_USAGE_DEFAULT; 189 | bDesc.ByteWidth = nTris * 3 * 4 * sizeof(float); 190 | bDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; 191 | bDesc.CPUAccessFlags = bDesc.MiscFlags = bDesc.StructureByteStride = 0; 192 | D3DVERIFY(device->CreateBuffer(&bDesc, &iData, &buf)); 193 | vBuffers.push_back(buf); 194 | } 195 | 196 | double BenchmarkTrianglesD3D(ID3D11Buffer *buf, int numTriangles, bool color) 197 | { 198 | // set the render target as the back buffer 199 | if (color) 200 | context->OMSetRenderTargets(1, &textureRTV, textureDSV); 201 | else 202 | context->OMSetRenderTargets(0, nullptr, textureDSV); 203 | 204 | // clear the back buffer to a deep blue 205 | float clearColor[4] = { 0.0f, 1.0f, 0.0f, 1.0f }; 206 | context->ClearRenderTargetView(textureRTV, clearColor); 207 | context->ClearDepthStencilView(textureDSV, D3D11_CLEAR_DEPTH, 1.0f, 0); 208 | 209 | // Setup primitivelist 210 | UINT stride = sizeof(float) * 4, offset = 0; 211 | context->IASetVertexBuffers(0, 1, &buf, &stride, &offset); 212 | context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); 213 | context->Flush(); 214 | 215 | // Draw triangles 216 | auto before = std::chrono::high_resolution_clock::now(); 217 | context->Begin(endQuery); 218 | context->Draw(numTriangles * 3, 0); 219 | context->End(endQuery); 220 | while 
(context->GetData(endQuery, nullptr, 0, 0) == S_FALSE) {} 221 | auto after = std::chrono::high_resolution_clock::now(); 222 | 223 | return std::chrono::duration(after - before).count(); 224 | } 225 | 226 | #endif 227 | 228 | //////////////////////////////////////////////////////////////////////////////////////// 229 | // Simple random triangle rasterizer benchmark 230 | //////////////////////////////////////////////////////////////////////////////////////// 231 | 232 | inline float frand() { return (float)rand() / (float)RAND_MAX; } 233 | 234 | void GenerateRandomTriangles(float *verts, unsigned int *triIdxBtF, int nTris, int size, float width, float height) 235 | { 236 | // make some of the triangles slightly out of screen to test clipping paths 237 | #if TEST_CLIPPING 238 | const float clippingMul = 1.1f; 239 | #else 240 | const float clippingMul = 1.0f; 241 | #endif 242 | 243 | for (int idx = 0; idx < nTris; ++idx) 244 | { 245 | triIdxBtF[idx * 3 + 0] = idx * 3 + 0; 246 | triIdxBtF[idx * 3 + 1] = idx * 3 + 1; 247 | triIdxBtF[idx * 3 + 2] = idx * 3 + 2; 248 | 249 | while (true) 250 | { 251 | float vtx[3][3] = { { 0, 0, 1 },{ size * 2 / (float)width, 0, 1 },{ 0, size * 2 / (float)height, 1 } }; 252 | float offset[3] = { gRndUniform(gRnd)*2.0f - 1.0f, gRndUniform(gRnd)*2.0f - 1.0f, 0 }; 253 | float rotation = gRndUniform(gRnd) * 2 * F_PI; 254 | 255 | float myz = (float)(nTris - idx); 256 | bool triOk = true; 257 | float rvtx[3][3]; 258 | for (int i = 0; i < 3; ++i) 259 | { 260 | rvtx[i][0] = cos(rotation)*vtx[i][0] - sin(rotation)*vtx[i][1] + offset[0]; 261 | rvtx[i][1] = sin(rotation)*vtx[i][0] + cos(rotation)*vtx[i][1] + offset[1]; 262 | 263 | if (rvtx[i][0] < -1.0f || rvtx[i][0] > 1.0f || rvtx[i][1] < -1.0f || rvtx[i][1] > 1.0f) 264 | triOk = false; 265 | 266 | float z = myz / (float)nTris; 267 | 268 | int vtxIdx = idx * 3 + i; 269 | verts[vtxIdx * 4 + 0] = rvtx[i][0] * myz * clippingMul; 270 | verts[vtxIdx * 4 + 1] = rvtx[i][1] * myz * clippingMul; 271 | 
verts[vtxIdx * 4 + 2] = z * myz; 272 | verts[vtxIdx * 4 + 3] = myz; 273 | } 274 | if (triOk) 275 | break; 276 | } 277 | } 278 | } 279 | 280 | double BenchmarkTriangles(float *verts, unsigned int *tris, int numTriangles, MaskedOcclusionCulling *moc) 281 | { 282 | moc->ClearBuffer(); 283 | 284 | auto before = std::chrono::high_resolution_clock::now(); 285 | #if TEST_CLIPPING 286 | moc->RenderTriangles( verts, tris, numTriangles, nullptr, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_ALL ); 287 | #else 288 | moc->RenderTriangles(verts, tris, numTriangles, nullptr, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_NONE); 289 | #endif 290 | auto after = std::chrono::high_resolution_clock::now(); 291 | 292 | return std::chrono::duration(after - before).count(); 293 | } 294 | 295 | double BenchmarkTrianglesThreaded(float *verts, unsigned int *tris, int numTriangles, CullingThreadpool *ctp) 296 | { 297 | ctp->ClearBuffer(); 298 | 299 | auto before = std::chrono::high_resolution_clock::now(); 300 | #if TEST_CLIPPING 301 | ctp->RenderTriangles(verts, tris, numTriangles, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_ALL); 302 | #else 303 | ctp->RenderTriangles( verts, tris, numTriangles, MaskedOcclusionCulling::BACKFACE_CW, MaskedOcclusionCulling::CLIP_PLANE_NONE ); 304 | #endif 305 | ctp->Flush(); 306 | auto after = std::chrono::high_resolution_clock::now(); 307 | 308 | return std::chrono::duration(after - before).count(); 309 | } 310 | 311 | //////////////////////////////////////////////////////////////////////////////////////// 312 | // Perform basic fillrate benchmarks for GPU and compare with this libray 313 | //////////////////////////////////////////////////////////////////////////////////////// 314 | 315 | int main(int argc, char* argv[]) 316 | { 317 | const int width = 1920, height = 1080; 318 | 319 | // Flush denorms to zero to avoid performance issues with small values 320 | 
_mm_setcsr(_mm_getcsr() | 0x8040); 321 | 322 | MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(); 323 | 324 | 325 | #ifdef _WIN32 326 | // Initialize directx 327 | InitD3D(width, height); 328 | #endif 329 | 330 | //////////////////////////////////////////////////////////////////////////////////////// 331 | // Setup and state related code 332 | //////////////////////////////////////////////////////////////////////////////////////// 333 | 334 | // Setup a rendertarget with near clip plane at w = 1.0 335 | moc->SetResolution(width, height); 336 | moc->SetNearClipPlane(1.0f); 337 | 338 | //////////////////////////////////////////////////////////////////////////////////////// 339 | // Create randomized triangles for back-to-front and front-to-back rendering 340 | //////////////////////////////////////////////////////////////////////////////////////// 341 | 342 | const int numTriangles[] = { 4096 * 1024, 4096 * 1024, 4096 * 1024, 2048 * 1024, 1024 * 1024, 512 * 1024, 256 * 1024 }; 343 | const int sizes[] = { 10, 25, 50, 75, 100, 250, 500 }; 344 | 345 | int numSizes = sizeof(sizes) / sizeof(int); 346 | 347 | printf("Generating randomized triangles"); 348 | std::vector trisBtF; 349 | std::vector verts; 350 | for (int i = 0; i < numSizes; ++i) 351 | { 352 | float *pVerts = new float[numTriangles[i] * 4 * 3]; 353 | unsigned int *pTrisBtF = new unsigned int[numTriangles[i] * 3]; 354 | GenerateRandomTriangles(pVerts, pTrisBtF, numTriangles[i], sizes[i], (float)width, (float)height); 355 | verts.push_back(pVerts); 356 | trisBtF.push_back(pTrisBtF); 357 | #ifdef _WIN32 358 | D3DAddTriangles(pVerts, numTriangles[i]); 359 | #endif 360 | printf("."); 361 | } 362 | 363 | //////////////////////////////////////////////////////////////////////////////////////// 364 | // Perform benchmarks 365 | //////////////////////////////////////////////////////////////////////////////////////// 366 | #ifdef _WIN32 367 | printf("\nD3D Z only\n"); 368 | printf("----\n"); 369 | for (int i 
= 0; i < numSizes; ++i) 370 | { 371 | int size = sizes[i]; 372 | double t = BenchmarkTrianglesD3D(vBuffers[i], numTriangles[i], false); 373 | double GPixelsPerSecond = (double)numTriangles[i] * size*size / (2.0 * 1e9 * t); 374 | double MTrisPerSecond = (double)numTriangles[i] / (1e6 * t); 375 | printf("Tri: %3dx%3d - Time: %7.2f ms, MTris/s: %6.2f GPixels/s: %5.2f \n", size, size, t * 1000.0f, MTrisPerSecond, GPixelsPerSecond); 376 | } 377 | #endif 378 | 379 | printf( "\nInstruction set:" ); 380 | switch( moc->GetImplementation( ) ) 381 | { 382 | case MaskedOcclusionCulling::SSE2: 383 | printf( "SSE2\n" ); 384 | break; 385 | case MaskedOcclusionCulling::SSE41: 386 | printf( "SSE41\n" ); 387 | break; 388 | case MaskedOcclusionCulling::AVX2: 389 | printf( "AVX2\n" ); 390 | break; 391 | case MaskedOcclusionCulling::AVX512: 392 | printf( "AVX512\n\n" ); 393 | break; 394 | default: 395 | break; 396 | 397 | }; 398 | 399 | #if 0 && MOC_RECORDER_ENABLE 400 | moc->RecorderStart( "..\\FrameRecorderPlayer\\FillrateTest.mocrec" ); 401 | #endif 402 | 403 | printf("\n\nMasked single threaded\n"); 404 | printf("----\n"); 405 | for (int i = 0; i < numSizes; ++i) 406 | { 407 | int size = sizes[i]; 408 | double t = BenchmarkTriangles(verts[i], trisBtF[i], numTriangles[i], moc); 409 | double GPixelsPerSecond = (double)numTriangles[i] * size*size / (2.0 * 1e9 * t); 410 | double MTrisPerSecond = (double)numTriangles[i] / (1e6 * t); 411 | printf("Tri: %3dx%3d - Time: %7.2f ms, MTris/s: %6.2f GPixels/s: %5.2f \n", size, size, t * 1000.0f, MTrisPerSecond, GPixelsPerSecond); 412 | } 413 | 414 | #if 0 && MOC_RECORDER_ENABLE 415 | moc->RecorderStop( ); 416 | #endif 417 | 418 | int numThreads = std::thread::hardware_concurrency() - 1; 419 | printf("\n\nMasked multi threaded (%d threads)\n", numThreads); 420 | printf("----\n"); 421 | CullingThreadpool ctp(numThreads, 2, numThreads); 422 | ctp.SetBuffer(moc); 423 | ctp.WakeThreads(); 424 | for (int i = 0; i < numSizes; ++i) 425 | { 426 | int 
size = sizes[i]; 427 | double t = BenchmarkTrianglesThreaded(verts[i], trisBtF[i], numTriangles[i], &ctp); 428 | double GPixelsPerSecond = (double)numTriangles[i] * size*size / (2.0 * 1e9 * t); 429 | double MTrisPerSecond = (double)numTriangles[i] / (1e6 * t); 430 | printf("Tri: %3dx%3d - Time: %7.2f ms, MTris/s: %6.2f GPixels/s: %5.2f \n", size, size, t * 1000.0f, MTrisPerSecond, GPixelsPerSecond); 431 | } 432 | ctp.SuspendThreads(); 433 | } -------------------------------------------------------------------------------- /FillrateTest/FillrateTest.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FillrateTest", "FillrateTest.vcxproj", "{C4229C47-7922-417C-9931-348CA8750D53}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_LLVM|x64 = Debug_LLVM|x64 11 | Debug|x64 = Debug|x64 12 | Release_LLVM|x64 = Release_LLVM|x64 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 17 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 18 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug|x64.ActiveCfg = Debug|x64 19 | {C4229C47-7922-417C-9931-348CA8750D53}.Debug|x64.Build.0 = Debug|x64 20 | {C4229C47-7922-417C-9931-348CA8750D53}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 21 | {C4229C47-7922-417C-9931-348CA8750D53}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 22 | {C4229C47-7922-417C-9931-348CA8750D53}.Release|x64.ActiveCfg = Release|x64 23 | {C4229C47-7922-417C-9931-348CA8750D53}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 
26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /FillrateTest/FillrateTest.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug_LLVM 6 | x64 7 | 8 | 9 | Release_LLVM 10 | x64 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {C4229C47-7922-417C-9931-348CA8750D53} 23 | Win32Proj 24 | FillrateTest 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | Unicode 33 | 34 | 35 | Application 36 | true 37 | LLVM-vs2014 38 | Unicode 39 | 40 | 41 | Application 42 | false 43 | v141 44 | Unicode 45 | 46 | 47 | Application 48 | false 49 | LLVM-vs2014 50 | true 51 | Unicode 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | true 73 | 74 | 75 | true 76 | 77 | 78 | false 79 | 80 | 81 | false 82 | 83 | 84 | 85 | 86 | 87 | Level3 88 | Disabled 89 | MOC_RECORDER_ENABLE=0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | false 91 | 92 | 93 | Console 94 | true 95 | 96 | 97 | 98 | 99 | 100 | 101 | Level3 102 | Disabled 103 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 104 | -msse4.1 %(AdditionalOptions) 105 | false 106 | 107 | 108 | Console 109 | true 110 | 111 | 112 | 113 | 114 | Level3 115 | 116 | 117 | MaxSpeed 118 | true 119 | true 120 | MOC_RECORDER_ENABLE=0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 121 | false 122 | 123 | 124 | Console 125 | true 126 | true 127 | true 128 | 129 | 130 | 131 | 132 | Level3 133 | 134 | 135 | MaxSpeed 136 | true 137 | true 138 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 139 | -msse4.1 %(AdditionalOptions) 140 | false 141 | 142 | 143 | Console 144 | true 145 | true 146 | true 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | AdvancedVectorExtensions2 155 | AdvancedVectorExtensions2 156 | AdvancedVectorExtensions2 157 | AdvancedVectorExtensions2 158 | false 159 | false 160 | false 161 | 
false 162 | 163 | 164 | false 165 | false 166 | false 167 | false 168 | AdvancedVectorExtensions2 169 | AdvancedVectorExtensions2 170 | AdvancedVectorExtensions2 171 | -mavx512f -mavx512bw -mavx512dq 172 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 173 | AdvancedVectorExtensions2 174 | false 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /FillrateTest/FillrateTest.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | MaskedOcclusionCulling 7 | 8 | 9 | MaskedOcclusionCulling 10 | 11 | 12 | MaskedOcclusionCulling 13 | 14 | 15 | MaskedOcclusionCulling 16 | 17 | 18 | MaskedOcclusionCulling 19 | 20 | 21 | 22 | 23 | MaskedOcclusionCulling 24 | 25 | 26 | MaskedOcclusionCulling 27 | 28 | 29 | MaskedOcclusionCulling 30 | 31 | 32 | 33 | 34 | {b848f282-5fc0-4806-b33e-649bedd84f85} 35 | 36 | 37 | 38 | 39 | MaskedOcclusionCulling 40 | 41 | 42 | MaskedOcclusionCulling 43 | 44 | 45 | -------------------------------------------------------------------------------- /FrameRecorder.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include "FrameRecorder.h" 17 | 18 | #if MOC_RECORDER_ENABLE 19 | #include 20 | #include 21 | 22 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 23 | // Masked occlusion culling 24 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 25 | 26 | bool MaskedOcclusionCulling::RecorderStart( const char * outputFilePath ) const 27 | { 28 | std::lock_guard lock( mRecorderMutex ); 29 | 30 | assert( mRecorder == nullptr ); // forgot to call RecorderStop? 31 | if( mRecorder != nullptr ) 32 | return false; 33 | 34 | #if MOC_RECORDER_USE_STDIO_FILE 35 | FILE * f; 36 | if( fopen_s( &f, outputFilePath, "wb" ) != 0 ) 37 | return false; 38 | #else 39 | std::ofstream outStream( outputFilePath, std::ios::out | std::ios::trunc | std::ios::binary ); 40 | if( !outStream.is_open( ) ) 41 | return false; 42 | #endif 43 | mRecorder = (FrameRecorder *)mAlignedAllocCallback( 64, sizeof( FrameRecorder ) ); 44 | #if MOC_RECORDER_USE_STDIO_FILE 45 | new (mRecorder) FrameRecorder( f, *this ); 46 | #else 47 | new (mRecorder) FrameRecorder( std::move( outStream ), *this ); 48 | #endif 49 | 50 | return true; 51 | } 52 | 53 | void MaskedOcclusionCulling::RecorderStop( ) const 54 | { 55 | std::lock_guard lock( mRecorderMutex ); 56 | 57 | mRecorder->~FrameRecorder(); 58 | mAlignedFreeCallback( mRecorder ); 59 | mRecorder = nullptr; 60 | } 61 | 62 | void MaskedOcclusionCulling::RecordRenderTriangles( const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, ClipPlanes clipPlaneMask, BackfaceWinding bfWinding, const VertexLayout &vtxLayout, CullingResult cullingResult ) 63 | { 
64 | std::lock_guard lock( mRecorderMutex ); 65 | if( mRecorder != nullptr ) 66 | mRecorder->RecordRenderTriangles( cullingResult, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout ); 67 | } 68 | 69 | 70 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 71 | // Masked occlusion culling recorder 72 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 73 | 74 | #if MOC_RECORDER_USE_STDIO_FILE 75 | FrameRecorder::FrameRecorder( FILE *& outFile, const MaskedOcclusionCulling & moc ) 76 | #else 77 | FrameRecorder::FrameRecorder( std::ofstream && outStream, const MaskedOcclusionCulling & moc ) : mOutStream( std::move( outStream ) ) 78 | #endif 79 | { 80 | #if MOC_RECORDER_USE_STDIO_FILE 81 | mOutFile = outFile; 82 | outFile = 0; 83 | assert( mOutFile != 0 ); 84 | #else 85 | assert( mOutStream.is_open( ) ); 86 | #endif 87 | 88 | // for file verification purposes 89 | unsigned int fileHeader = 0xA701B600; 90 | Write( &fileHeader, sizeof( fileHeader ) ); 91 | 92 | // save some of the MOC states (we can override them for the playback) 93 | float nearClipPlane = moc.GetNearClipPlane(); 94 | unsigned int width; 95 | unsigned int height; 96 | moc.GetResolution( width, height ); 97 | 98 | Write( &nearClipPlane, sizeof( nearClipPlane ) ); 99 | Write( &width, sizeof( width ) ); 100 | Write( &height, sizeof( height ) ); 101 | } 102 | 103 | FrameRecorder::~FrameRecorder( ) 104 | { 105 | // end of file marker 106 | char footer = 0x7F; 107 | Write( &footer, 1 ); 108 | 109 | #if MOC_RECORDER_USE_STDIO_FILE 110 | fclose( mOutFile ); 111 | #else 112 | mOutStream.close( ); 113 | #endif 114 | } 115 | 116 | void FrameRecorder::Write( const void * buffer, size_t size ) 117 | { 118 | #if MOC_RECORDER_USE_STDIO_FILE 119 | fwrite( buffer, 1, size, mOutFile ); 120 | #else 121 | 
mOutStream.write( (const char *)buffer, size ); 122 | #endif 123 | } 124 | 125 | void FrameRecorder::WriteTriangleRecording( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout & vtxLayout ) 126 | { 127 | // write culling result 128 | Write( &cullingResult, sizeof( cullingResult ) ); 129 | 130 | unsigned int minVIndex = 0xffffffff; 131 | unsigned int maxVIndex = 0; 132 | for( int i = 0; i < nTris; i++ ) 133 | { 134 | const unsigned int & a = inTris[i * 3 + 0]; 135 | const unsigned int & b = inTris[i * 3 + 1]; 136 | const unsigned int & c = inTris[i * 3 + 2]; 137 | minVIndex = std::min( std::min( minVIndex, a ), std::min( b, c ) ); 138 | maxVIndex = std::max( std::max( maxVIndex, a ), std::max( b, c ) ); 139 | } 140 | 141 | // write actually used vertex count 142 | int vertexCount = ( maxVIndex < minVIndex ) ? ( 0 ) : ( maxVIndex - minVIndex + 1 ); 143 | Write( &vertexCount, sizeof( vertexCount ) ); 144 | 145 | // nothing more to write? 
early exit 146 | if( vertexCount == 0 ) 147 | return; 148 | 149 | // write vertex size 150 | int vertexSize = vtxLayout.mStride; 151 | Write( &vertexSize, sizeof( vertexSize ) ); 152 | 153 | // write vertices 154 | Write( ( (const char*)inVtx ) + minVIndex*vertexSize, vertexSize * ( vertexCount ) ); 155 | 156 | // write triangle count 157 | Write( &nTris, sizeof( nTris ) ); 158 | 159 | // write indices with adjusted offset 160 | for( int i = 0; i < nTris; i++ ) 161 | { 162 | unsigned int triangleIndices[3]; 163 | triangleIndices[0] = inTris[i * 3 + 0] - minVIndex; 164 | triangleIndices[1] = inTris[i * 3 + 1] - minVIndex; 165 | triangleIndices[2] = inTris[i * 3 + 2] - minVIndex; 166 | Write( triangleIndices, sizeof( triangleIndices ) ); 167 | } 168 | 169 | // write model to clip matrix (if any) 170 | char hasMatrix = ( modelToClipMatrix != nullptr ) ? ( 1 ) : ( 0 ); 171 | Write( &hasMatrix, sizeof( hasMatrix ) ); 172 | if( hasMatrix ) 173 | Write( modelToClipMatrix, 16 * sizeof( float ) ); 174 | 175 | Write( &clipPlaneMask, sizeof( clipPlaneMask ) ); 176 | 177 | Write( &bfWinding, sizeof( bfWinding ) ); 178 | 179 | // write vertex layout 180 | Write( &vtxLayout, sizeof( vtxLayout ) ); 181 | } 182 | 183 | namespace 184 | { 185 | // Warning, takes ownership of the underlying stream and closes it at the end 186 | struct InStreamWrapper 187 | { 188 | #if MOC_RECORDER_USE_STDIO_FILE 189 | FILE * mInFile; 190 | #else 191 | std::ifstream mInStream; 192 | #endif 193 | 194 | #if MOC_RECORDER_USE_STDIO_FILE 195 | InStreamWrapper( FILE *& inFile ) { mInFile = inFile; assert( mInFile != 0 ); inFile = 0; } 196 | ~InStreamWrapper( ) { fclose( mInFile ); } 197 | #else 198 | InStreamWrapper( std::ifstream && inStream ) : mInStream( std::move( inStream ) ) { assert( mInStream.is_open() ); } 199 | #endif 200 | 201 | size_t Read( void * buffer, size_t size ) 202 | { 203 | #if MOC_RECORDER_USE_STDIO_FILE 204 | return fread( buffer, 1, size, mInFile ); 205 | #else 206 | mInStream.read( 
(char*)buffer, size ); 207 | return mInStream.gcount( ); 208 | #endif 209 | } 210 | }; 211 | } 212 | 213 | void FrameRecorder::RecordClearBuffer( ) 214 | { 215 | char header = 3; 216 | Write( &header, 1 ); 217 | } 218 | 219 | void FrameRecorder::RecordRenderTriangles( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout & vtxLayout ) 220 | { 221 | char header = 0; 222 | Write( &header, 1 ); 223 | WriteTriangleRecording( cullingResult, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout ); 224 | } 225 | 226 | void FrameRecorder::RecordTestRect( MaskedOcclusionCulling::CullingResult cullingResult, float xmin, float ymin, float xmax, float ymax, float wmin ) 227 | { 228 | char header = 1; 229 | Write( &header, 1 ); 230 | 231 | Write( &cullingResult, sizeof( cullingResult ) ); 232 | Write( &xmin, sizeof( xmin ) ); 233 | Write( &ymin, sizeof( ymin ) ); 234 | Write( &xmax, sizeof( xmax ) ); 235 | Write( &ymax, sizeof( ymax ) ); 236 | Write( &wmin, sizeof( wmin ) ); 237 | } 238 | 239 | void FrameRecorder::RecordTestTriangles( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout & vtxLayout ) 240 | { 241 | char header = 2; 242 | Write( &header, 1 ); 243 | WriteTriangleRecording( cullingResult, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout ); 244 | } 245 | 246 | 247 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 248 | // Masked occlusion culling recording 249 
| ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 250 | 251 | 252 | #if MOC_RECORDER_ENABLE_PLAYBACK 253 | 254 | static bool ReadTriangleRecording( FrameRecording::TrianglesEntry & outEntry, InStreamWrapper & inStream ) 255 | { 256 | // read culling result 257 | if( inStream.Read( (char*)&outEntry.mCullingResult, sizeof( outEntry.mCullingResult ) ) != sizeof( outEntry.mCullingResult ) ) 258 | { 259 | assert( false ); 260 | return false; 261 | } 262 | 263 | // read used vertex count 264 | int vertexCount = 0; 265 | if( inStream.Read( (char*)&vertexCount, sizeof( vertexCount ) ) != sizeof( vertexCount ) ) 266 | { 267 | assert( false ); 268 | return false; 269 | } 270 | 271 | // nothing in the recording? that's ok, just exit 272 | if( vertexCount == 0 ) 273 | { 274 | outEntry.mVertices.clear( ); 275 | outEntry.mTriangles.clear( ); 276 | return true; 277 | } 278 | 279 | 280 | // read vertex size 281 | int vertexSize = 0; 282 | if( inStream.Read( (char*)&vertexSize, sizeof( vertexSize ) ) != sizeof( vertexSize ) ) 283 | { 284 | assert( false ); 285 | return false; 286 | } 287 | 288 | // read vertices 289 | outEntry.mVertices.resize( vertexSize / 4 * vertexCount ); // pre-allocate data 290 | if( inStream.Read( (char*)outEntry.mVertices.data( ), vertexSize * vertexCount ) != ( vertexSize * vertexCount ) ) 291 | { 292 | assert( false ); 293 | return false; 294 | } 295 | 296 | // read triangle count 297 | int triangleCount = 0; 298 | if( inStream.Read( (char*)&triangleCount, sizeof( triangleCount ) ) != sizeof( triangleCount ) ) 299 | { 300 | assert( false ); 301 | return false; 302 | } 303 | 304 | outEntry.mTriangles.resize( triangleCount * 3 ); 305 | if( inStream.Read( (char*)outEntry.mTriangles.data( ), triangleCount * 3 * 4 ) != ( triangleCount * 3 * 4 ) ) 306 | { 307 | assert( false ); 308 | return false; 309 | } 310 | 311 | // read matrix (if any) 312 | char hasMatrix = 
0; 313 | if( inStream.Read( (char*)&hasMatrix, sizeof( hasMatrix ) ) != sizeof( hasMatrix ) ) 314 | { 315 | assert( false ); 316 | return false; 317 | } 318 | 319 | if( ( outEntry.mHasModelToClipMatrix = ( hasMatrix != 0 ) ) ) 320 | { 321 | if( inStream.Read( (char*)outEntry.mModelToClipMatrix, 16 * sizeof( float ) ) != 16 * sizeof( float ) ) 322 | { 323 | assert( false ); 324 | return false; 325 | } 326 | } 327 | else 328 | { 329 | memset( outEntry.mModelToClipMatrix, 0, 16 * sizeof( float ) ); 330 | } 331 | 332 | if( inStream.Read( (char*)&outEntry.mClipPlaneMask, sizeof( outEntry.mClipPlaneMask ) ) != sizeof( outEntry.mClipPlaneMask ) ) 333 | { 334 | assert( false ); 335 | return false; 336 | } 337 | 338 | // read triangle cull winding 339 | if( inStream.Read( (char*)&outEntry.mbfWinding, sizeof( outEntry.mbfWinding ) ) != sizeof( outEntry.mbfWinding ) ) 340 | { 341 | assert( false ); 342 | return false; 343 | } 344 | 345 | // read vertex layout 346 | if( inStream.Read( (char*)&outEntry.mVertexLayout, sizeof( outEntry.mVertexLayout ) ) != sizeof( outEntry.mVertexLayout ) ) 347 | { 348 | assert( false ); 349 | return false; 350 | } 351 | if( outEntry.mVertexLayout.mStride != vertexSize ) 352 | { 353 | assert( false ); 354 | return false; 355 | } 356 | return true; 357 | } 358 | 359 | bool FrameRecording::Load( const char * inputFilePath, FrameRecording & outRecording ) 360 | { 361 | outRecording.Reset(); 362 | 363 | #if MOC_RECORDER_USE_STDIO_FILE 364 | FILE * inIOFile = 0; 365 | if( fopen_s( &inIOFile, inputFilePath, "rb" ) != 0 ) 366 | return false; 367 | InStreamWrapper inStream( inIOFile ); 368 | #else 369 | std::ifstream inIOStream( inputFilePath, std::ios::binary ); 370 | if( !inIOStream.is_open( ) ) 371 | { 372 | return false; 373 | } 374 | InStreamWrapper inStream( std::move(inIOStream) ); 375 | #endif 376 | 377 | // for file verification purposes 378 | unsigned int fileHeader = 0; 379 | if( ( inStream.Read( (char *)&fileHeader, sizeof( fileHeader ) ) != 
4 ) || (fileHeader != 0xA701B600) ) 380 | { 381 | assert( false ); 382 | return false; 383 | } 384 | 385 | if( inStream.Read( (char *)&outRecording.mNearClipPlane, sizeof( outRecording.mNearClipPlane ) ) != sizeof( outRecording.mNearClipPlane ) ) { assert( false ); return false; } 386 | if( inStream.Read( (char *)&outRecording.mResolutionWidth, sizeof( outRecording.mResolutionWidth ) ) != sizeof( outRecording.mResolutionWidth ) ) { assert( false ); return false; } 387 | if( inStream.Read( (char *)&outRecording.mResolutionHeight, sizeof( outRecording.mResolutionHeight ) ) != sizeof( outRecording.mResolutionHeight ) ) { assert( false ); return false; } 388 | 389 | bool continueLoading = true; 390 | while( continueLoading ) 391 | { 392 | char chunkHeader = 0; 393 | if( inStream.Read( (char *)&chunkHeader, sizeof( chunkHeader ) ) != 1 ) 394 | { 395 | assert( false ); 396 | outRecording.Reset(); 397 | return false; 398 | } 399 | switch( chunkHeader ) 400 | { 401 | case( 0 ): // RenderTriangles 402 | case( 2 ): // TestTriangles 403 | { 404 | outRecording.mTriangleEntries.push_back( TrianglesEntry() ); 405 | int triangleEntryIndex = (int)outRecording.mTriangleEntries.size( )-1; 406 | if( !ReadTriangleRecording( outRecording.mTriangleEntries[triangleEntryIndex], inStream ) ) 407 | { 408 | assert( false ); 409 | outRecording.Reset( ); 410 | return false; 411 | } 412 | outRecording.mPlaybackOrder.push_back( std::make_pair( chunkHeader, triangleEntryIndex ) ); 413 | } break; 414 | case( 1 ): // TestRect 415 | { 416 | outRecording.mRectEntries.push_back( RectEntry( ) ); 417 | int rectEntryIndex = (int)outRecording.mRectEntries.size( )-1; 418 | 419 | // read rectangle in one go 420 | if( inStream.Read( (char*)&outRecording.mRectEntries[rectEntryIndex], sizeof( outRecording.mRectEntries[rectEntryIndex] ) ) != sizeof( outRecording.mRectEntries[rectEntryIndex] ) ) 421 | { 422 | assert( false ); 423 | outRecording.Reset( ); 424 | return false; 425 | } 426 | 
outRecording.mPlaybackOrder.push_back( std::make_pair( chunkHeader, rectEntryIndex ) ); 427 | } break; 428 | case( 3 ): // ClearBuffer 429 | { 430 | outRecording.mPlaybackOrder.push_back( std::make_pair( 3, -1 ) ); 431 | } break; 432 | case( 0x7F ): // eOF 433 | { 434 | continueLoading = false; 435 | return true; 436 | } break; 437 | default: 438 | { 439 | assert( false ); 440 | outRecording.Reset( ); 441 | return false; 442 | } 443 | } 444 | } 445 | 446 | assert( false ); // we should never get here 447 | return true; 448 | } 449 | 450 | #endif // #if MOC_RECORDER_ENABLE_PLAYBACK 451 | 452 | #endif // #if MOC_RECORDER_ENABLE -------------------------------------------------------------------------------- /FrameRecorder.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #pragma once 17 | 18 | /*! 
19 | * \file FrameRecorder.h 20 | * \brief Masked occlusion culling recorder class (set MOC_RECORDER_ENABLE to 1 to enable) 21 | * 22 | * Masked occlusion culling recorder class (To enable, set MOC_RECORDER_ENABLE to 1 in MaskedOcclusionCulling.h) 23 | * 24 | * Enables gathering and storing all triangle rendering and all testing calls and their results to a file, for 25 | * later playback and performance testing. 26 | * Usage info: 27 | * - Calling MaskedOcclusionCulling::RecorderStart with a file name will open the file and start recording all subsequent 28 | * triangle rendering and any testing calls including the test results, and MaskedOcclusionCulling::RecorderStop will 29 | * stop recording and close the file. 30 | * - ClearBuffer-s are not recorded so if recording multiple frames, Start/RecorderStop is needed for each frame. For 31 | * correctness testing, the recording should be started around or after ClearBuffer, before any other "Render" calls. 32 | * - BinTriangles and RenderTrilist calls are NOT recorded; If using a custom multithreaded rendering, one should 33 | * record input triangles by manually calling MaskedOcclusionCulling::RecordRenderTriangles - see 34 | * CullingThreadpool::RenderTriangles for an example. 35 | * This is done intentionally in order to get raw input triangles so we can performance test and optimize various 36 | * thread pool approaches. 37 | */ 38 | 39 | #include "MaskedOcclusionCulling.h" 40 | 41 | #ifndef MOC_RECORDER_USE_STDIO_FILE 42 | /*! 
43 | * Whether to use FILE or std::ofstream/istream for file access (to avoid compatibility issues in some environments) 44 | */ 45 | #define MOC_RECORDER_USE_STDIO_FILE 1 46 | #endif 47 | 48 | #if MOC_RECORDER_ENABLE 49 | 50 | #if MOC_RECORDER_USE_STDIO_FILE 51 | #include 52 | #else 53 | #include 54 | #endif 55 | 56 | #include 57 | 58 | class FrameRecorder 59 | { 60 | #if MOC_RECORDER_USE_STDIO_FILE 61 | FILE * mOutFile; 62 | #else 63 | std::ofstream mOutStream; 64 | #endif 65 | 66 | protected: 67 | friend class MaskedOcclusionCulling; 68 | #if MOC_RECORDER_USE_STDIO_FILE 69 | FrameRecorder( FILE *& outFile, const MaskedOcclusionCulling & moc ); 70 | #else 71 | FrameRecorder( std::ofstream && outStream, const MaskedOcclusionCulling & moc ); 72 | #endif 73 | 74 | public: 75 | ~FrameRecorder( ); 76 | 77 | protected: 78 | void Write( const void * buffer, size_t size ); 79 | void WriteTriangleRecording( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout & vtxLayout ); 80 | 81 | public: 82 | void RecordClearBuffer( ); 83 | void RecordRenderTriangles( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout &vtxLayout ); 84 | void RecordTestRect( MaskedOcclusionCulling::CullingResult cullingResult, float xmin, float ymin, float xmax, float ymax, float wmin ); 85 | void RecordTestTriangles( MaskedOcclusionCulling::CullingResult cullingResult, const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, MaskedOcclusionCulling::ClipPlanes clipPlaneMask, 
MaskedOcclusionCulling::BackfaceWinding bfWinding, const MaskedOcclusionCulling::VertexLayout &vtxLayout ); 86 | }; 87 | 88 | #if MOC_RECORDER_ENABLE_PLAYBACK 89 | #include 90 | 91 | #if 0 // For future use - in case all vector uses below need conversion to custom allocator 92 | template 93 | struct MOCVectorAllocator 94 | { 95 | const MaskedOcclusionCulling::pfnAlignedAlloc m_alloc; 96 | const MaskedOcclusionCulling::pfnAlignedFree m_free; 97 | typedef T value_type; 98 | MOCVectorAllocator( ) = delete; 99 | MOCVectorAllocator( MaskedOcclusionCulling::pfnAlignedAlloc alloc, MaskedOcclusionCulling::pfnAlignedFree free ) noexcept : m_alloc( alloc ), m_free( free ) { } 100 | template constexpr MOCVectorAllocator( const MOCVectorAllocator& c ) noexcept : m_alloc( c.m_alloc ), m_free( c.m_free ) {} 101 | T* allocate( std::size_t n ) 102 | { 103 | if( n > std::size_t( -1 ) / sizeof( T ) ) throw std::bad_alloc( ); 104 | if( auto p = static_cast( m_alloc( 64, n * sizeof( T ) ) ) ) return p; 105 | throw std::bad_alloc( ); 106 | } 107 | void deallocate( T* p, std::size_t ) noexcept 108 | { 109 | m_free( p ); 110 | } 111 | }; 112 | template 113 | inline bool operator==( const MOCVectorAllocator&, const MOCVectorAllocator& ) { return true; } 114 | template 115 | inline bool operator!=( const MOCVectorAllocator&, const MOCVectorAllocator& ) { return false; } 116 | #endif 117 | 118 | struct FrameRecording 119 | { 120 | struct TrianglesEntry 121 | { 122 | MaskedOcclusionCulling::CullingResult mCullingResult; 123 | std::vector mVertices; 124 | std::vector mTriangles; 125 | float mModelToClipMatrix[16]; 126 | MaskedOcclusionCulling::BackfaceWinding mbfWinding; 127 | MaskedOcclusionCulling::VertexLayout mVertexLayout; 128 | MaskedOcclusionCulling::ClipPlanes mClipPlaneMask; 129 | bool mHasModelToClipMatrix; 130 | bool mHasScissorRect; 131 | }; 132 | struct RectEntry 133 | { 134 | MaskedOcclusionCulling::CullingResult mCullingResult; 135 | float mXMin; 136 | float mYMin; 137 | float 
mXMax; 138 | float mYMax; 139 | float mWMin; 140 | }; 141 | 142 | // list of type&index pairs for playback ( type 0 is RenderTriangles, type 1 is TestRect, type 2 is TestTriangles, type 3 is ClearBuffer ) 143 | std::vector< std::pair< char, int > > mPlaybackOrder; 144 | 145 | std::vector< TrianglesEntry > mTriangleEntries; 146 | std::vector< RectEntry > mRectEntries; 147 | 148 | float mNearClipPlane; 149 | unsigned int mResolutionWidth; 150 | unsigned int mResolutionHeight; 151 | 152 | FrameRecording( ) = default; 153 | FrameRecording( const FrameRecording & other ) = default; 154 | 155 | FrameRecording( FrameRecording && other ) 156 | { 157 | mPlaybackOrder = std::move( other.mPlaybackOrder ); 158 | mTriangleEntries = std::move( other.mTriangleEntries ); 159 | mRectEntries = std::move( other.mRectEntries ); 160 | mNearClipPlane = other.mNearClipPlane; 161 | mResolutionWidth = other.mResolutionWidth; 162 | mResolutionHeight = other.mResolutionHeight; 163 | } 164 | 165 | void Reset( ) 166 | { 167 | mPlaybackOrder.clear(); 168 | mTriangleEntries.clear(); 169 | mRectEntries.clear(); 170 | mNearClipPlane = 0.0f; 171 | mResolutionWidth = 0; 172 | mResolutionHeight = 0; 173 | } 174 | 175 | static bool Load( const char * inputFilePath, FrameRecording & outRecording ); 176 | }; 177 | 178 | #endif // #if MOC_RECORDER_ENABLE_PLAYBACK 179 | 180 | #endif // #if MOC_RECORDER_ENABLE -------------------------------------------------------------------------------- /FrameRecorderPlayer/FrameRecorderPlayer.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "../CullingThreadpool.h" 30 | #include "../MaskedOcclusionCulling.h" 31 | #include "../FrameRecorder.h" 32 | 33 | #if !MOC_RECORDER_ENABLE 34 | #error This project needs to be compiled with MOC_RECORDER_ENABLE set to 1 35 | #endif 36 | 37 | #if !MOC_RECORDER_ENABLE_PLAYBACK 38 | #error This project needs to be compiled with MOC_RECORDER_ENABLE_PLAYBACK set to 1 39 | #endif 40 | 41 | //////////////////////////////////////////////////////////////////////////////////////// 42 | // Image utility functions, minimal BMP writer and depth buffer tone mapping 43 | //////////////////////////////////////////////////////////////////////////////////////// 44 | 45 | static void WriteBMP( const char *filename, const unsigned char *data, int w, int h ) 46 | { 47 | short header[] = { 0x4D42, 0, 0, 0, 0, 26, 0, 12, 0, (short)w, (short)h, 1, 24 }; 48 | #pragma warning ( suppress : 4996 ) 49 | FILE * f; 50 | if( fopen_s( &f, filename, "wb" ) == 0 ) 51 | { 52 | fwrite( header, 1, sizeof( header ), f ); 53 | fwrite( data, 1, w * h * 3, f ); 54 | fclose( f ); 55 | } 56 | else 57 | { 58 | printf( "\nError trying to save to %s", filename ); 59 | } 60 | } 61 | 62 | static void TonemapDepth( float *depth, unsigned char *image, int w, int h ) 63 | { 64 | // Find min/max w 
coordinate (discard cleared pixels) 65 | float minW = FLT_MAX, maxW = 0.0f; 66 | for( int i = 0; i < w*h; ++i ) 67 | { 68 | if( depth[i] > 0.0f ) 69 | { 70 | minW = std::min( minW, depth[i] ); 71 | maxW = std::max( maxW, depth[i] ); 72 | } 73 | } 74 | 75 | // Tonemap depth values 76 | for( int i = 0; i < w*h; ++i ) 77 | { 78 | int intensity = 0; 79 | if( depth[i] > 0 ) 80 | intensity = (unsigned char)( 223.0*( depth[i] - minW ) / ( maxW - minW ) + 32.0 ); 81 | 82 | image[i * 3 + 0] = intensity; 83 | image[i * 3 + 1] = intensity; 84 | image[i * 3 + 2] = intensity; 85 | } 86 | } 87 | 88 | struct BenchStats 89 | { 90 | double Time; 91 | int ClearCount; 92 | int64_t TriangleCount; 93 | int64_t RectTestCount; 94 | }; 95 | 96 | BenchStats BenchmarkRecording( FrameRecording & recording, MaskedOcclusionCulling & moc, const int loopCount, const bool includeTests = false ) 97 | { 98 | assert( !includeTests ); // not yet implemented 99 | 100 | BenchStats stats; memset( &stats, 0, sizeof(stats ) ); 101 | 102 | auto before = std::chrono::high_resolution_clock::now(); 103 | 104 | moc.SetNearClipPlane( recording.mNearClipPlane ); 105 | 106 | for( int loop = 0; loop < loopCount; loop++ ) 107 | { 108 | // enforce first clear 109 | moc.ClearBuffer( ); 110 | stats.ClearCount++; 111 | 112 | for( int i = 0; i < recording.mPlaybackOrder.size( ); i++ ) 113 | { 114 | char elementType = recording.mPlaybackOrder[i].first; 115 | int elementIndex = recording.mPlaybackOrder[i].second; 116 | switch( elementType ) 117 | { 118 | case( 0 ): // RenderTriangles 119 | { 120 | const FrameRecording::TrianglesEntry & triangleEntry = recording.mTriangleEntries[elementIndex]; 121 | moc.RenderTriangles( triangleEntry.mVertices.data( ), triangleEntry.mTriangles.data( ), (int)triangleEntry.mTriangles.size( ) / 3, ( triangleEntry.mHasModelToClipMatrix ) ? 
( triangleEntry.mModelToClipMatrix ) : ( nullptr ), triangleEntry.mbfWinding, triangleEntry.mClipPlaneMask, triangleEntry.mVertexLayout ); 122 | } break; 123 | case( 1 ): // TestRect 124 | { 125 | } break; 126 | case( 2 ): // TestTriangles 127 | { 128 | } break; 129 | case( 3 ): // ClearBuffer 130 | { 131 | if( i != 0 ) // skip if first clear because we enforced that anyway 132 | { 133 | moc.ClearBuffer( ); 134 | stats.ClearCount++; 135 | } 136 | } break; 137 | default: assert( false ); 138 | }; 139 | } 140 | } 141 | 142 | auto after = std::chrono::high_resolution_clock::now(); 143 | 144 | stats.Time = std::chrono::duration( after - before ).count( ); 145 | 146 | return stats; 147 | } 148 | 149 | BenchStats BenchmarkRecording( FrameRecording & recording, CullingThreadpool & mocThreadpool, const int loopCount, const bool includeTests = false ) 150 | { 151 | assert( !includeTests ); // not yet implemented 152 | 153 | BenchStats stats; memset( &stats, 0, sizeof( stats ) ); 154 | 155 | auto before = std::chrono::high_resolution_clock::now( ); 156 | 157 | mocThreadpool.SetNearClipPlane( recording.mNearClipPlane ); 158 | 159 | for( int loop = 0; loop < loopCount; loop++ ) 160 | { 161 | // enforce first clear 162 | mocThreadpool.ClearBuffer( ); 163 | stats.ClearCount++; 164 | 165 | for( int i = 0; i < recording.mPlaybackOrder.size( ); i++ ) 166 | { 167 | char elementType = recording.mPlaybackOrder[i].first; 168 | int elementIndex = recording.mPlaybackOrder[i].second; 169 | switch( elementType ) 170 | { 171 | case( 0 ): // RenderTriangles 172 | { 173 | const FrameRecording::TrianglesEntry & triangleEntry = recording.mTriangleEntries[elementIndex]; 174 | mocThreadpool.SetMatrix( ( triangleEntry.mHasModelToClipMatrix ) ? 
( triangleEntry.mModelToClipMatrix ) : ( nullptr ) ); 175 | mocThreadpool.SetVertexLayout( triangleEntry.mVertexLayout ); 176 | assert( !triangleEntry.mHasScissorRect ); // can't use scissor rect in multithreaded scenario because it's already used by the binning part of the algorithm 177 | mocThreadpool.RenderTriangles( triangleEntry.mVertices.data( ), triangleEntry.mTriangles.data( ), (int)triangleEntry.mTriangles.size( ) / 3, triangleEntry.mbfWinding ); 178 | stats.TriangleCount += triangleEntry.mTriangles.size( ); 179 | } break; 180 | case( 1 ): // TestRect 181 | { 182 | } break; 183 | case( 2 ): // TestTriangles 184 | { 185 | } break; 186 | case( 3 ): // ClearBuffer 187 | { 188 | if( i != 0 ) // skip if first clear because we enforced that anyway 189 | { 190 | mocThreadpool.ClearBuffer( ); 191 | stats.ClearCount++; 192 | } 193 | } break; 194 | default: assert( false ); break; 195 | }; 196 | } 197 | mocThreadpool.Flush( ); 198 | } 199 | 200 | auto after = std::chrono::high_resolution_clock::now(); 201 | 202 | stats.Time = std::chrono::duration(after - before).count(); 203 | 204 | return stats; 205 | } 206 | 207 | 208 | /* 209 | double BenchmarkTrianglesD3D(ID3D11Buffer *buf, int numTriangles, bool color) 210 | { 211 | // set the render target as the back buffer 212 | if (color) 213 | context->OMSetRenderTargets(1, &textureRTV, textureDSV); 214 | else 215 | context->OMSetRenderTargets(0, nullptr, textureDSV); 216 | 217 | // clear the back buffer to a deep blue 218 | float clearColor[4] = { 0.0f, 1.0f, 0.0f, 1.0f }; 219 | context->ClearRenderTargetView(textureRTV, clearColor); 220 | context->ClearDepthStencilView(textureDSV, D3D11_CLEAR_DEPTH, 1.0f, 0); 221 | 222 | // Setup primitivelist 223 | UINT stride = sizeof(float) * 4, offset = 0; 224 | context->IASetVertexBuffers(0, 1, &buf, &stride, &offset); 225 | context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); 226 | context->Flush(); 227 | 228 | // Draw triangles 229 | auto before = 
std::chrono::high_resolution_clock::now(); 230 | context->Begin(endQuery); 231 | context->Draw(numTriangles * 3, 0); 232 | context->End(endQuery); 233 | while (context->GetData(endQuery, nullptr, 0, 0) == S_FALSE) {} 234 | auto after = std::chrono::high_resolution_clock::now(); 235 | 236 | return std::chrono::duration(after - before).count(); 237 | } 238 | */ 239 | 240 | //////////////////////////////////////////////////////////////////////////////////////// 241 | // Perform basic fillrate benchmarks for GPU and compare with this libray 242 | //////////////////////////////////////////////////////////////////////////////////////// 243 | 244 | int main(int argc, char* argv[]) 245 | { 246 | // settings 247 | const int width = 1920; 248 | const int height = 1080; 249 | const float nearClip = 0.1f; 250 | const int benchLoopCount = 1000; 251 | 252 | // Flush denorms to zero to avoid performance issues with small values 253 | _mm_setcsr(_mm_getcsr() | 0x8040); 254 | 255 | MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(); 256 | 257 | // Initialize directx 258 | // InitD3D(width, height); 259 | 260 | // Setup and state related code 261 | //////////////////////////////////////////////////////////////////////////////////////// 262 | 263 | // Setup a rendertarget and near clip plane 264 | moc->SetResolution(width, height); 265 | moc->SetNearClipPlane(nearClip); 266 | 267 | printf( "Masked Occlusion Culling performance tester\n" ); 268 | 269 | printf( "Compiler: " ); 270 | #ifdef __clang__ 271 | printf( "clang/llvm\n" ); 272 | #else 273 | printf( "msvc\n" ); 274 | #endif 275 | 276 | printf( "Instruction set: " ); 277 | if( moc->GetImplementation( ) == MaskedOcclusionCulling::SSE2 ) printf( "SSE2\n" ); 278 | if( moc->GetImplementation( ) == MaskedOcclusionCulling::SSE41 ) printf( "SSE41\n" ); 279 | if( moc->GetImplementation( ) == MaskedOcclusionCulling::AVX2 ) printf( "AVX2\n" ); 280 | if( moc->GetImplementation( ) == MaskedOcclusionCulling::AVX512 ) printf( 
"AVX512\n" ); 281 | 282 | printf( "\nMOC resolution used: %d x %d\n", width, height ); 283 | 284 | namespace fs = ::std::experimental::filesystem; 285 | 286 | std::vector< std::pair< std::string, FrameRecording > > recordedFiles; 287 | 288 | printf( "\nLoading all '*.mocrec' files in the working directory...\n" ); 289 | for( auto & p : fs::directory_iterator( "./" ) ) 290 | { 291 | if( fs::is_regular_file( p ) && p.path( ).extension( ) == ".mocrec" ) 292 | { 293 | std::string fileName = p.path( ).string( ); 294 | FrameRecording record; 295 | if( FrameRecording::Load( fileName.c_str(), record ) ) 296 | { 297 | recordedFiles.push_back( std::make_pair( fileName, std::move(record) ) ); 298 | printf( " loaded dataset %d from '%s': OK\n", int(recordedFiles.size()-1), fileName.c_str( ) ); 299 | } 300 | else 301 | { 302 | printf( " loading of '%s' failed!\n", fileName.c_str() ); 303 | } 304 | } 305 | } 306 | 307 | printf( "\nSaving playback screenshots for loaded recording entries\n" ); 308 | for( int i = 0; i < (int)recordedFiles.size(); i++ ) 309 | { 310 | auto & entry = recordedFiles[i]; 311 | moc->ClearBuffer(); 312 | 313 | // Save previously loaded recording entries - useful to keep existing data alive when changing/expanding storage format 314 | #define RESAVE_ENTRIES 0 315 | #if RESAVE_ENTRIES != 0 316 | moc->RecorderStart( (entry.first).c_str() ); 317 | #endif 318 | const FrameRecording & recording = entry.second; 319 | moc->SetNearClipPlane(recording.mNearClipPlane); 320 | for( int i = 0; i < recording.mPlaybackOrder.size(); i++ ) 321 | { 322 | char elementType = recording.mPlaybackOrder[i].first; 323 | int elementIndex = recording.mPlaybackOrder[i].second; 324 | switch( elementType ) 325 | { 326 | case( 0 ): // RenderTriangles 327 | { 328 | const FrameRecording::TrianglesEntry & triangleEntry = recording.mTriangleEntries[elementIndex]; 329 | moc->RenderTriangles( triangleEntry.mVertices.data(), triangleEntry.mTriangles.data(), 
(int)triangleEntry.mTriangles.size()/3, (triangleEntry.mHasModelToClipMatrix)?(triangleEntry.mModelToClipMatrix):(nullptr), triangleEntry.mbfWinding, triangleEntry.mClipPlaneMask, triangleEntry.mVertexLayout ); 330 | } break; 331 | case( 1 ): // TestRect 332 | { 333 | } break; 334 | case( 2 ): // TestTriangles 335 | { 336 | } break; 337 | case( 3 ): // ClearBuffer 338 | { 339 | moc->ClearBuffer( ); 340 | } break; 341 | default: assert( false ); 342 | } 343 | } 344 | #if RESAVE_ENTRIES != 0 345 | moc->RecorderStop(); 346 | #endif 347 | 348 | char fileName[1024]; sprintf_s( fileName, sizeof( fileName ), "%s.bmp", entry.first.c_str() ); 349 | 350 | // Compute a per pixel depth buffer from the hierarchical depth buffer, used for visualization. 351 | float *perPixelZBuffer = new float[width * height]; 352 | moc->ComputePixelDepthBuffer( perPixelZBuffer, true ); 353 | 354 | // Tonemap the image 355 | unsigned char *image = new unsigned char[width * height * 3]; 356 | TonemapDepth( perPixelZBuffer, image, width, height ); 357 | WriteBMP( fileName, image, width, height ); 358 | delete[] image; 359 | 360 | printf( " %d - '%s' written.\n", i, fileName ); 361 | } 362 | 363 | 364 | //////////////////////////////////////////////////////////////////////////////////////// 365 | // Perform benchmarks 366 | //////////////////////////////////////////////////////////////////////////////////////// 367 | /* 368 | printf("\nD3D Z only\n"); 369 | printf("----\n"); 370 | for (int i = 0; i < numSizes; ++i) 371 | { 372 | int size = sizes[i]; 373 | double t = BenchmarkTrianglesD3D(vBuffers[i], numTriangles[i], false); 374 | double GPixelsPerSecond = (double)numTriangles[i] * size*size / (2.0 * 1e9 * t); 375 | double MTrisPerSecond = (double)numTriangles[i] / (1e6 * t); 376 | printf("Tri: %3dx%3d - Time: %7.2f ms, MTris/s: %6.2f GPixels/s: %5.2f \n", size, size, t * 1000.0f, MTrisPerSecond, GPixelsPerSecond); 377 | } 378 | */ 379 | 380 | std::vector singleThreadedTimes; 381 | 
singleThreadedTimes.resize( recordedFiles.size() ); 382 | 383 | printf("\nSingle threaded benchmark (%d loops of each frame capture)\n", benchLoopCount); 384 | for( int i = 0; i < (int)recordedFiles.size( ); i++ ) 385 | { 386 | auto & entry = recordedFiles[i]; 387 | BenchStats stats = BenchmarkRecording( entry.second, *moc, benchLoopCount ); 388 | 389 | float MTrisPerSecond = (float)((double)stats.TriangleCount / (1e6 * stats.Time)); 390 | printf(" %d - tris: %12" PRId64 ", MTris/s: %6.2f, total time: %9.3fms, single loop time: %6.3fms\n", i, stats.TriangleCount, MTrisPerSecond, float(stats.Time*1000.0f), float(stats.Time*1000.0/double(benchLoopCount)) ); 391 | 392 | singleThreadedTimes[i] = stats.Time; 393 | } 394 | printf( "----\n" ); 395 | 396 | int numThreads = std::thread::hardware_concurrency( ); 397 | printf( "\nMulti threaded benchmark (%d loops of each frame capture, %d threads)\n", benchLoopCount, numThreads ); 398 | CullingThreadpool ctp( numThreads, 2, numThreads ); 399 | ctp.SetBuffer( moc ); 400 | ctp.WakeThreads( ); 401 | for( int i = 0; i < (int)recordedFiles.size( ); i++ ) 402 | { 403 | auto & entry = recordedFiles[i]; 404 | BenchStats stats = BenchmarkRecording( entry.second, ctp, benchLoopCount ); 405 | 406 | float MTrisPerSecond = (float)( (double)stats.TriangleCount / ( 1e6 * stats.Time ) ); 407 | printf( " %d - tris: %12" PRId64 ", MTris/s: %6.2f, total time: %9.3fms, single loop time: %6.3fms, MT scaling: %.3fx\n", i, stats.TriangleCount, MTrisPerSecond, float( stats.Time*1000.0f ), float( stats.Time*1000.0 / double( benchLoopCount ) ), singleThreadedTimes[i] / stats.Time ); 408 | } 409 | ctp.SuspendThreads( ); 410 | printf( "----\n" ); 411 | } -------------------------------------------------------------------------------- /FrameRecorderPlayer/FrameRecorderPlayer.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | 
VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FrameRecorderPlayer", "FrameRecorderPlayer.vcxproj", "{D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug_LLVM|x64 = Debug_LLVM|x64 11 | Debug|x64 = Debug|x64 12 | Release_LLVM|x64 = Release_LLVM|x64 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug_LLVM|x64.ActiveCfg = Debug_LLVM|x64 17 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug_LLVM|x64.Build.0 = Debug_LLVM|x64 18 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug|x64.ActiveCfg = Debug|x64 19 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Debug|x64.Build.0 = Debug|x64 20 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release_LLVM|x64.ActiveCfg = Release_LLVM|x64 21 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release_LLVM|x64.Build.0 = Release_LLVM|x64 22 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release|x64.ActiveCfg = Release|x64 23 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F}.Release|x64.Build.0 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /FrameRecorderPlayer/FrameRecorderPlayer.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug_LLVM 6 | x64 7 | 8 | 9 | Release_LLVM 10 | x64 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {D9BAD3FB-95A9-4AB4-9B2B-DAC2A103345F} 23 | Win32Proj 24 | FrameRecorderPlayer 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | Unicode 33 | 34 | 35 | Application 36 | true 37 | LLVM-vs2014 38 | Unicode 39 | 40 | 41 | Application 42 | false 43 | v141 44 | 
Unicode 45 | 46 | 47 | Application 48 | false 49 | LLVM-vs2014 50 | true 51 | Unicode 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | true 73 | 74 | 75 | true 76 | 77 | 78 | false 79 | 80 | 81 | false 82 | 83 | 84 | 85 | 86 | 87 | Level3 88 | Disabled 89 | MOC_RECORDER_ENABLE=1;MOC_RECORDER_ENABLE_PLAYBACK=1;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Console 93 | true 94 | 95 | 96 | 97 | 98 | 99 | 100 | Level3 101 | Disabled 102 | MOC_RECORDER_ENABLE=1;MOC_RECORDER_ENABLE_PLAYBACK=1;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 103 | -msse4.1 %(AdditionalOptions) 104 | 105 | 106 | Console 107 | true 108 | 109 | 110 | 111 | 112 | Level3 113 | 114 | 115 | MaxSpeed 116 | true 117 | true 118 | MOC_RECORDER_ENABLE=1;MOC_RECORDER_ENABLE_PLAYBACK=1;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 119 | 120 | 121 | Console 122 | true 123 | true 124 | true 125 | 126 | 127 | 128 | 129 | Level3 130 | 131 | 132 | MaxSpeed 133 | true 134 | true 135 | MOC_RECORDER_ENABLE=1;MOC_RECORDER_ENABLE_PLAYBACK=1;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 136 | -msse4.1 %(AdditionalOptions) 137 | 138 | 139 | Console 140 | true 141 | true 142 | true 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | AdvancedVectorExtensions2 151 | AdvancedVectorExtensions2 152 | AdvancedVectorExtensions2 153 | AdvancedVectorExtensions2 154 | 155 | 156 | AdvancedVectorExtensions2 157 | AdvancedVectorExtensions2 158 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 159 | false 160 | AdvancedVectorExtensions2 161 | AdvancedVectorExtensions2 162 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /FrameRecorderPlayer/FrameRecorderPlayer.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 
MaskedOcclusionCulling 7 | 8 | 9 | MaskedOcclusionCulling 10 | 11 | 12 | MaskedOcclusionCulling 13 | 14 | 15 | MaskedOcclusionCulling 16 | 17 | 18 | MaskedOcclusionCulling 19 | 20 | 21 | 22 | 23 | MaskedOcclusionCulling 24 | 25 | 26 | MaskedOcclusionCulling 27 | 28 | 29 | MaskedOcclusionCulling 30 | 31 | 32 | 33 | 34 | {b848f282-5fc0-4806-b33e-649bedd84f85} 35 | 36 | 37 | 38 | 39 | MaskedOcclusionCulling 40 | 41 | 42 | -------------------------------------------------------------------------------- /FrameRecorderPlayer/OcclusionCulling_0.mocrec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GameTechDev/MaskedOcclusionCulling/f9fbd42412e0e272f5bc96c78efb2a7152f2956d/FrameRecorderPlayer/OcclusionCulling_0.mocrec -------------------------------------------------------------------------------- /FrameRecorderPlayer/OcclusionCulling_1.mocrec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GameTechDev/MaskedOcclusionCulling/f9fbd42412e0e272f5bc96c78efb2a7152f2956d/FrameRecorderPlayer/OcclusionCulling_1.mocrec -------------------------------------------------------------------------------- /FrameRecorderPlayer/OcclusionCulling_2.mocrec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GameTechDev/MaskedOcclusionCulling/f9fbd42412e0e272f5bc96c78efb2a7152f2956d/FrameRecorderPlayer/OcclusionCulling_2.mocrec -------------------------------------------------------------------------------- /MaskedOcclusionCullingAVX2.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2017 Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy 6 | // of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations 14 | // under the License. 15 | //////////////////////////////////////////////////////////////////////////////// 16 | #include 17 | #include 18 | #include 19 | #include "MaskedOcclusionCulling.h" 20 | #include "CompilerSpecific.inl" 21 | 22 | #if MOC_RECORDER_ENABLE 23 | #include "FrameRecorder.h" 24 | #endif 25 | 26 | #if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900 27 | // If you remove/comment this error, the code will compile & use the SSE41 version instead. 28 | #error Older versions than visual studio 2015 not supported due to compiler bug(s) 29 | #endif 30 | 31 | #if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900 32 | 33 | // For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid 34 | // AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE 35 | // version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to 36 | // use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files. 
37 | #ifndef __AVX2__ 38 | #error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2 39 | #endif 40 | 41 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 42 | // AVX specific defines and constants 43 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 44 | 45 | #define SIMD_LANES 8 46 | #define TILE_HEIGHT_SHIFT 3 47 | 48 | #define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7) 49 | 50 | #define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) 51 | #define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT) 52 | #define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) 53 | #define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT) 54 | 55 | #define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF) 56 | 57 | #define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920) 58 | #define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f) 59 | 60 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 61 | // AVX specific typedefs and functions 62 | 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 63 | 64 | typedef __m256 __mw; 65 | typedef __m256i __mwi; 66 | 67 | #define _mmw_set1_ps _mm256_set1_ps 68 | #define _mmw_setzero_ps _mm256_setzero_ps 69 | #define _mmw_and_ps _mm256_and_ps 70 | #define _mmw_or_ps _mm256_or_ps 71 | #define _mmw_xor_ps _mm256_xor_ps 72 | #define _mmw_not_ps(a) _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0))) 73 | #define _mmw_andnot_ps _mm256_andnot_ps 74 | #define _mmw_neg_ps(a) _mm256_xor_ps((a), _mm256_set1_ps(-0.0f)) 75 | #define _mmw_abs_ps(a) _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))) 76 | #define _mmw_add_ps _mm256_add_ps 77 | #define _mmw_sub_ps _mm256_sub_ps 78 | #define _mmw_mul_ps _mm256_mul_ps 79 | #define _mmw_div_ps _mm256_div_ps 80 | #define _mmw_min_ps _mm256_min_ps 81 | #define _mmw_max_ps _mm256_max_ps 82 | #define _mmw_fmadd_ps _mm256_fmadd_ps 83 | #define _mmw_fmsub_ps _mm256_fmsub_ps 84 | #define _mmw_movemask_ps _mm256_movemask_ps 85 | #define _mmw_blendv_ps _mm256_blendv_ps 86 | #define _mmw_cmpge_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) 87 | #define _mmw_cmpgt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) 88 | #define _mmw_cmpeq_ps(a,b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) 89 | #define _mmw_floor_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) 90 | #define _mmw_ceil_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) 91 | #define _mmw_shuffle_ps _mm256_shuffle_ps 92 | #define _mmw_insertf32x4_ps _mm256_insertf128_ps 93 | #define _mmw_cvtepi32_ps _mm256_cvtepi32_ps 94 | #define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c))) 95 | 96 | #define _mmw_set1_epi32 _mm256_set1_epi32 97 | #define _mmw_setzero_epi32 _mm256_setzero_si256 98 | #define _mmw_and_epi32 _mm256_and_si256 99 | #define _mmw_or_epi32 _mm256_or_si256 100 | #define 
_mmw_xor_epi32 _mm256_xor_si256 101 | #define _mmw_not_epi32(a) _mm256_xor_si256((a), _mm256_set1_epi32(~0)) 102 | #define _mmw_andnot_epi32 _mm256_andnot_si256 103 | #define _mmw_neg_epi32(a) _mm256_sub_epi32(_mm256_set1_epi32(0), (a)) 104 | #define _mmw_add_epi32 _mm256_add_epi32 105 | #define _mmw_sub_epi32 _mm256_sub_epi32 106 | #define _mmw_min_epi32 _mm256_min_epi32 107 | #define _mmw_max_epi32 _mm256_max_epi32 108 | #define _mmw_subs_epu16 _mm256_subs_epu16 109 | #define _mmw_mullo_epi32 _mm256_mullo_epi32 110 | #define _mmw_cmpeq_epi32 _mm256_cmpeq_epi32 111 | #define _mmw_testz_epi32 _mm256_testz_si256 112 | #define _mmw_cmpgt_epi32 _mm256_cmpgt_epi32 113 | #define _mmw_srai_epi32 _mm256_srai_epi32 114 | #define _mmw_srli_epi32 _mm256_srli_epi32 115 | #define _mmw_slli_epi32 _mm256_slli_epi32 116 | #define _mmw_sllv_ones(x) _mm256_sllv_epi32(SIMD_BITS_ONE, x) 117 | #define _mmw_transpose_epi8(x) _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES) 118 | #define _mmw_abs_epi32 _mm256_abs_epi32 119 | #define _mmw_cvtps_epi32 _mm256_cvtps_epi32 120 | #define _mmw_cvttps_epi32 _mm256_cvttps_epi32 121 | 122 | #define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF) 123 | #define _mmx_fmadd_ps _mm_fmadd_ps 124 | #define _mmx_max_epi32 _mm_max_epi32 125 | #define _mmx_min_epi32 _mm_min_epi32 126 | 127 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 128 | // SIMD casting functions 129 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 130 | 131 | template FORCE_INLINE T simd_cast(Y A); 132 | template<> FORCE_INLINE __m128 simd_cast<__m128>(float A) { return _mm_set1_ps(A); } 133 | template<> FORCE_INLINE __m128 simd_cast<__m128>(__m128i A) { return _mm_castsi128_ps(A); } 134 | template<> FORCE_INLINE __m128 simd_cast<__m128>(__m128 A) { return A; } 135 | template<> 
FORCE_INLINE __m128i simd_cast<__m128i>(int A) { return _mm_set1_epi32(A); } 136 | template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128 A) { return _mm_castps_si128(A); } 137 | template<> FORCE_INLINE __m128i simd_cast<__m128i>(__m128i A) { return A; } 138 | template<> FORCE_INLINE __m256 simd_cast<__m256>(float A) { return _mm256_set1_ps(A); } 139 | template<> FORCE_INLINE __m256 simd_cast<__m256>(__m256i A) { return _mm256_castsi256_ps(A); } 140 | template<> FORCE_INLINE __m256 simd_cast<__m256>(__m256 A) { return A; } 141 | template<> FORCE_INLINE __m256i simd_cast<__m256i>(int A) { return _mm256_set1_epi32(A); } 142 | template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256 A) { return _mm256_castps_si256(A); } 143 | template<> FORCE_INLINE __m256i simd_cast<__m256i>(__m256i A) { return A; } 144 | 145 | #define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \ 146 | FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \ 147 | union accessor { simd_type m_native; base_type m_array[elements]; }; \ 148 | is_const accessor *acs = reinterpret_cast(&a); \ 149 | return acs->m_array; \ 150 | } 151 | 152 | MAKE_ACCESSOR(simd_f32, __m128, float, , 4) 153 | MAKE_ACCESSOR(simd_f32, __m128, float, const, 4) 154 | MAKE_ACCESSOR(simd_i32, __m128i, int, , 4) 155 | MAKE_ACCESSOR(simd_i32, __m128i, int, const, 4) 156 | 157 | MAKE_ACCESSOR(simd_f32, __m256, float, , 8) 158 | MAKE_ACCESSOR(simd_f32, __m256, float, const, 8) 159 | MAKE_ACCESSOR(simd_i32, __m256i, int, , 8) 160 | MAKE_ACCESSOR(simd_i32, __m256i, int, const, 8) 161 | 162 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 163 | // Specialized AVX input assembly function for general vertex gather 164 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 165 | 166 | typedef 
MaskedOcclusionCulling::VertexLayout VertexLayout; 167 | 168 | FORCE_INLINE void GatherVertices(__m256 *vtxX, __m256 *vtxY, __m256 *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes, const VertexLayout &vtxLayout) 169 | { 170 | assert(numLanes >= 1); 171 | 172 | const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21); 173 | static const __m256i SIMD_LANE_MASK[9] = { 174 | _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), 175 | _mm256_setr_epi32(~0, 0, 0, 0, 0, 0, 0, 0), 176 | _mm256_setr_epi32(~0, ~0, 0, 0, 0, 0, 0, 0), 177 | _mm256_setr_epi32(~0, ~0, ~0, 0, 0, 0, 0, 0), 178 | _mm256_setr_epi32(~0, ~0, ~0, ~0, 0, 0, 0, 0), 179 | _mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, 0, 0, 0), 180 | _mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, 0, 0), 181 | _mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, 0), 182 | _mm256_setr_epi32(~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0) 183 | }; 184 | 185 | // Compute per-lane index list offset that guards against out of bounds memory accesses 186 | __m256i safeTriIdxOffset = _mm256_and_si256(SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes]); 187 | 188 | // Fetch triangle indices. 
189 | __m256i vtxIdx[3]; 190 | vtxIdx[0] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 0, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride)); 191 | vtxIdx[1] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 1, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride)); 192 | vtxIdx[2] = _mmw_mullo_epi32(_mm256_i32gather_epi32((const int*)inTrisPtr + 2, safeTriIdxOffset, 4), _mmw_set1_epi32(vtxLayout.mStride)); 193 | 194 | char *vPtr = (char *)inVtx; 195 | 196 | // Fetch triangle vertices 197 | for (int i = 0; i < 3; i++) 198 | { 199 | vtxX[i] = _mm256_i32gather_ps((float *)vPtr, vtxIdx[i], 1); 200 | vtxY[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetY), vtxIdx[i], 1); 201 | vtxW[i] = _mm256_i32gather_ps((float *)(vPtr + vtxLayout.mOffsetW), vtxIdx[i], 1); 202 | } 203 | } 204 | 205 | namespace MaskedOcclusionCullingAVX2 206 | { 207 | static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2; 208 | 209 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 210 | // Include common algorithm implementation (general, SIMD independent code) 211 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 212 | 213 | #include "MaskedOcclusionCullingCommon.inl" 214 | 215 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 216 | // Utility function to create a new object using the allocator callbacks 217 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 218 | 219 | typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; 220 | typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; 221 | 222 | 
MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) 223 | { 224 | MaskedOcclusionCullingPrivate *object = (MaskedOcclusionCullingPrivate *)alignedAlloc(64, sizeof(MaskedOcclusionCullingPrivate)); 225 | new (object) MaskedOcclusionCullingPrivate(alignedAlloc, alignedFree); 226 | return object; 227 | } 228 | }; 229 | 230 | #else 231 | 232 | namespace MaskedOcclusionCullingAVX2 233 | { 234 | typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; 235 | typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; 236 | 237 | MaskedOcclusionCulling *CreateMaskedOcclusionCulling(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) 238 | { 239 | return nullptr; 240 | } 241 | }; 242 | 243 | #endif 244 | -------------------------------------------------------------------------------- /StaticLib/StaticLib.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | {958E770F-16AB-468D-9DA8-C2DFF91824BB} 24 | Win32Proj 25 | StaticLib 26 | 10.0.16299.0 27 | 28 | 29 | 30 | StaticLibrary 31 | true 32 | v141 33 | Unicode 34 | 35 | 36 | StaticLibrary 37 | false 38 | v141 39 | true 40 | Unicode 41 | 42 | 43 | StaticLibrary 44 | true 45 | v141 46 | Unicode 47 | 48 | 49 | StaticLibrary 50 | false 51 | v141 52 | true 53 | Unicode 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | true 75 | 76 | 77 | true 78 | 79 | 80 | false 81 | 82 | 83 | false 84 | 85 | 86 | 87 | NotUsing 88 | Level3 89 | Disabled 90 | true 91 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 92 | 93 | 94 | Console 95 | true 96 | 97 | 98 | 99 | 100 | NotUsing 101 | Level3 102 | Disabled 103 | true 104 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 105 | 106 | 107 | Console 108 | true 109 | 110 | 111 | 112 
| 113 | NotUsing 114 | Level3 115 | MaxSpeed 116 | true 117 | true 118 | true 119 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 120 | 121 | 122 | Console 123 | true 124 | true 125 | true 126 | 127 | 128 | 129 | 130 | NotUsing 131 | Level3 132 | MaxSpeed 133 | true 134 | true 135 | true 136 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 137 | 138 | 139 | Console 140 | true 141 | true 142 | true 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | AdvancedVectorExtensions2 156 | AdvancedVectorExtensions2 157 | AdvancedVectorExtensions2 158 | AdvancedVectorExtensions2 159 | 160 | 161 | AdvancedVectorExtensions2 162 | AdvancedVectorExtensions2 163 | AdvancedVectorExtensions2 164 | AdvancedVectorExtensions2 165 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 166 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 167 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 168 | -mavx512f -mavx512bw -mavx512dq %(AdditionalOptions) 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 |  2 | Apache License 3 | Version 2.0, January 2004 4 | 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, and 12 | distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 15 | owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all other entities 18 | that control, are controlled by, or are under common control with that entity. 
19 | For the purposes of this definition, "control" means (i) the power, direct or 20 | indirect, to cause the direction or management of such entity, whether by 21 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity exercising 25 | permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, including 28 | but not limited to software source code, documentation source, and configuration 29 | files. 30 | 31 | "Object" form shall mean any form resulting from mechanical transformation or 32 | translation of a Source form, including but not limited to compiled object code, 33 | generated documentation, and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or Object form, made 36 | available under the License, as indicated by a copyright notice that is included 37 | in or attached to the work (an example is provided in the Appendix below). 38 | 39 | "Derivative Works" shall mean any work, whether in Source or Object form, that 40 | is based on (or derived from) the Work and for which the editorial revisions, 41 | annotations, elaborations, or other modifications represent, as a whole, an 42 | original work of authorship. For the purposes of this License, Derivative Works 43 | shall not include works that remain separable from, or merely link (or bind by 44 | name) to the interfaces of, the Work and Derivative Works thereof. 
45 | 46 | "Contribution" shall mean any work of authorship, including the original version 47 | of the Work and any modifications or additions to that Work or Derivative Works 48 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 49 | by the copyright owner or by an individual or Legal Entity authorized to submit 50 | on behalf of the copyright owner. For the purposes of this definition, 51 | "submitted" means any form of electronic, verbal, or written communication sent 52 | to the Licensor or its representatives, including but not limited to 53 | communication on electronic mailing lists, source code control systems, and 54 | issue tracking systems that are managed by, or on behalf of, the Licensor for 55 | the purpose of discussing and improving the Work, but excluding communication 56 | that is conspicuously marked or otherwise designated in writing by the copyright 57 | owner as "Not a Contribution." 58 | 59 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 60 | of whom a Contribution has been received by Licensor and subsequently 61 | incorporated within the Work. 62 | 63 | 2. Grant of Copyright License. Subject to the terms and conditions of this 64 | License, each Contributor hereby grants to You a perpetual, worldwide, 65 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 66 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 67 | sublicense, and distribute the Work and such Derivative Works in Source or 68 | Object form. 69 | 70 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, 71 | each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, 72 | no-charge, royalty-free, irrevocable (except as stated in this section) patent 73 | license to make, have made, use, offer to sell, sell, import, and otherwise 74 | transfer the Work, where such license applies only to those patent claims 75 | licensable by such Contributor that are necessarily infringed by their 76 | Contribution(s) alone or by combination of their Contribution(s) with the Work 77 | to which such Contribution(s) was submitted. If You institute patent litigation 78 | against any entity (including a cross-claim or counterclaim in a lawsuit) 79 | alleging that the Work or a Contribution incorporated within the Work 80 | constitutes direct or contributory patent infringement, then any patent licenses 81 | granted to You under this License for that Work shall terminate as of the date 82 | such litigation is filed. 83 | 84 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or 85 | Derivative Works thereof in any medium, with or without modifications, and in 86 | Source or Object form, provided that You meet the following conditions: 87 | You must give any other recipients of the Work or Derivative Works a copy of 88 | this License; and 89 | 90 | 91 | You must cause any modified files to carry prominent notices stating that You 92 | changed the files; and 93 | 94 | 95 | You must retain, in the Source form of any Derivative Works that You 96 | distribute, all copyright, patent, trademark, and attribution notices from the 97 | Source form of the Work, excluding those notices that do not pertain to any 98 | part of the Derivative Works; and 99 | 100 | 101 | If the Work includes a "NOTICE" text file as part of its distribution, then 102 | any Derivative Works that You distribute must include a readable copy of the 103 | attribution notices contained within such NOTICE file, excluding those notices 104 | that do not pertain to any part of the Derivative Works, in at least one of 105 | the following places: within a NOTICE text file distributed as part of the 106 | Derivative Works; within the Source form or documentation, if provided along 107 | with the Derivative Works; or, within a display generated by the Derivative 108 | Works, if and wherever such third-party notices normally appear. The contents 109 | of the NOTICE file are for informational purposes only and do not modify the 110 | License. You may add Your own attribution notices within Derivative Works that 111 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 112 | provided that such additional attribution notices cannot be construed as 113 | modifying the License. 
114 | You may add Your own copyright statement to Your modifications and may provide 115 | additional or different license terms and conditions for use, reproduction, or 116 | distribution of Your modifications, or for any such Derivative Works as a whole, 117 | provided Your use, reproduction, and distribution of the Work otherwise complies 118 | with the conditions stated in this License. 119 | 120 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 121 | Contribution intentionally submitted for inclusion in the Work by You to the 122 | Licensor shall be under the terms and conditions of this License, without any 123 | additional terms or conditions. Notwithstanding the above, nothing herein shall 124 | supersede or modify the terms of any separate license agreement you may have 125 | executed with Licensor regarding such Contributions. 126 | 127 | 6. Trademarks. This License does not grant permission to use the trade names, 128 | trademarks, service marks, or product names of the Licensor, except as required 129 | for reasonable and customary use in describing the origin of the Work and 130 | reproducing the content of the NOTICE file. 131 | 132 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 133 | writing, Licensor provides the Work (and each Contributor provides its 134 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 135 | KIND, either express or implied, including, without limitation, any warranties 136 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 137 | PARTICULAR PURPOSE. You are solely responsible for determining the 138 | appropriateness of using or redistributing the Work and assume any risks 139 | associated with Your exercise of permissions under this License. 140 | 141 | 8. Limitation of Liability. 
In no event and under no legal theory, whether in 142 | tort (including negligence), contract, or otherwise, unless required by 143 | applicable law (such as deliberate and grossly negligent acts) or agreed to in 144 | writing, shall any Contributor be liable to You for damages, including any 145 | direct, indirect, special, incidental, or consequential damages of any character 146 | arising as a result of this License or out of the use or inability to use the 147 | Work (including but not limited to damages for loss of goodwill, work stoppage, 148 | computer failure or malfunction, or any and all other commercial damages or 149 | losses), even if such Contributor has been advised of the possibility of such 150 | damages. 151 | 152 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or 153 | Derivative Works thereof, You may choose to offer, and charge a fee for, 154 | acceptance of support, warranty, indemnity, or other liability obligations 155 | and/or rights consistent with this License. However, in accepting such 156 | obligations, You may act only on Your own behalf and on Your sole 157 | responsibility, not on behalf of any other Contributor, and only if You agree to 158 | indemnify, defend, and hold each Contributor harmless for any liability incurred 159 | by, or claims asserted against, such Contributor by reason of your accepting any 160 | such warranty or additional liability. 161 | 162 | END OF TERMS AND CONDITIONS 163 | 164 | APPENDIX: How to apply the Apache License to your work 165 | 166 | To apply the Apache License to your work, attach the following boilerplate 167 | notice, with the fields enclosed by brackets "[]" replaced with your own 168 | identifying information. (Don't include the brackets!) The text should be 169 | enclosed in the appropriate comment syntax for the file format. 
We also 170 | recommend that a file or class name and description of purpose be included on 171 | the same "printed page" as the copyright notice for easier identification within 172 | third-party archives. 173 | 174 | Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, 175 | Version 2.0 (the "License"); you may not use this file except in compliance with 176 | the License. You may obtain a copy of the License at 177 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or 178 | agreed to in writing, software distributed under the License is distributed on 179 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 180 | or implied. See the License for the specific language governing permissions and 181 | limitations under the License. 182 | --------------------------------------------------------------------------------