├── LICENSE ├── README.md └── windows └── AMDCoreCount.cpp /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CPU core counts 2 | 3 | This self-contained code sample shows you how to correctly detect physical core and logical processor counts on AMD processors. The sample also makes recommendations about the default thread pool size you should create for game initialization and game execution, on AMD processors. 4 | 5 | That recommendation is guidance only. Always profile and fit the thread pool size appropriately for your game's uses, since overall processor performance in games is affected by many factors. 6 | 7 | ## How to build the sample on Windows 8 | 9 | - Clone this repo and open a 64-bit Visual Studio Developer Command Prompt inside the [`windows`](windows) subdirectory. 10 | - `cl.exe AMDCoreCount.cpp` 11 | - `cl.exe` will build the code and generate a file called `AMDCoreCount.exe` which you can then run. 12 | 13 | ## Support and suggestions 14 | 15 | If you spot a bug, need any guidance, or would like to see the samples evolve in a particular way, file an issue and we'll take a look! 16 | 17 | ## Changelog 18 | 19 | - v1.0 20 | - Initial release 21 | 22 | - v2.0 23 | - Removed the Windows XP compatibility. Windows 7 or greater is now required. 24 | - Added `RYZEN_CORES_THRESHOLD` to help document the thread pool sizing guidance code. 25 | - Show more information about the detected processor, including name and vendor. 26 | - More clarity that the thread pool sizes for game play and game initialization are guidance only: *always remember to profile!* 27 | 28 | These self-contained code samples show you how to correctly detect physical and logical processor core counts. 29 | 30 | ## Supported operating systems 31 | 32 | - [x] Windows 7 33 | - [x] Windows 10 34 | 35 | ## Support and suggestions 36 | 37 | If you spot a bug, or would like to see the samples evolve in a particular way, file an issue and we'll take a look! 38 | -------------------------------------------------------------------------------- /windows/AMDCoreCount.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 3 | // 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy 5 | // of this software and associated documentation files (the "Software"), to deal 6 | // in the Software without restriction, including without limitation the rights 7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | // copies of the Software, and to permit persons to whom the Software is 9 | // furnished to do so, subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be included in 12 | // all copies or substantial portions of the Software. 13 | // 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | // THE SOFTWARE. 21 | // 22 | 23 | // This advice is specific to AMD processors and is not general guidance for all processor manufacturers. 24 | // 25 | // GetLogicalProcessorInformationEx requires Win7 or later 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #pragma comment( lib, "PowrProf" ) 33 | 34 | #define AMD_BULLDOZER_FAMILY 0x15 35 | 36 | // Note that this structure definition was accidentally omitted from WinNT.h 37 | typedef struct _PROCESSOR_POWER_INFORMATION { 38 | ULONG Number; 39 | ULONG MaxMhz; 40 | ULONG CurrentMhz; 41 | ULONG MhzLimit; 42 | ULONG MaxIdleState; 43 | ULONG CurrentIdleState; 44 | } PROCESSOR_POWER_INFORMATION, * PPROCESSOR_POWER_INFORMATION; 45 | 46 | // getProcessorInfo() collects the following information about the CPU(s) in the system: 47 | // 48 | // groups - number of configured processor groups, usually 1 49 | // numaNodes - number of configured NUMA nodes, usually 1 50 | // cores - number of physical processor cores 51 | // logicals - number of logical process cores, usually 2 x cores on processors with symmetric multithreading (SMT) enabled 52 | // maxLlcSize - the size of the processor's last level cache, in bytes 53 | // maxEfficiencyClass - the relationship between this processor and any other in terms of efficiency, with higher values corresponding to lower relative efficiency 54 | // NOTE: maxEfficiencyClass is only non-zero on systems with a heterogeneous set of cores 55 | void getProcessorInfo(DWORD& groups, DWORD& numaNodes, DWORD& cores, DWORD& logicals, DWORD& maxLlcSize, BYTE& maxEfficiencyClass, BOOL forceSingleNumaNode = false) { 56 | // Consider all processors in the system with a fully set affinity mask 57 | GROUP_AFFINITY filterGroupAffinity = { static_cast(0xffffffffffffffff), 0 }; 58 | 59 | if (forceSingleNumaNode) { 60 | PROCESSOR_NUMBER ProcNum; 61 | USHORT FilterNodeNumber; 62 | GetThreadIdealProcessorEx(GetCurrentThread(), &ProcNum); 63 | GetNumaProcessorNodeEx(&ProcNum, &FilterNodeNumber); 64 | GetNumaNodeProcessorMaskEx(FilterNodeNumber, &filterGroupAffinity); 65 | } 66 | 67 | groups = numaNodes = cores = logicals = maxLlcSize = maxEfficiencyClass = 0; 68 | char* buffer = NULL; 69 | DWORD len = 0; 70 | 71 | if (FALSE == GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer, &len)) { 72 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 73 | buffer = (char*)malloc(len); 74 | 75 | if (GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer, &len)) { 76 | const char* ptr = buffer; 77 | 78 | while (ptr < buffer + len) { 79 | PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pi = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)ptr; 80 | 81 | if (NULL == pi) { 82 | break; 83 | } 84 | 85 | if (pi->Relationship == RelationProcessorCore) { 86 | if (forceSingleNumaNode) { 87 | for (size_t g = 0; g < pi->Processor.GroupCount; ++g) { 88 | if (filterGroupAffinity.Group == pi->Processor.GroupMask[g].Group) { 89 | KAFFINITY intersection = filterGroupAffinity.Mask & pi->Processor.GroupMask[g].Mask; 90 | 91 | if (intersection > 0) { 92 | cores++; 93 | logicals += static_cast(__popcnt64(intersection)); 94 | } 95 | } 96 | } 97 | } 98 | else { 99 | cores++; 100 | 101 | for (size_t g = 0; g < pi->Processor.GroupCount; ++g) { 102 | logicals += static_cast(__popcnt64(pi->Processor.GroupMask[g].Mask)); 103 | } 104 | } 105 | 106 | if (pi->Processor.EfficiencyClass > maxEfficiencyClass) { 107 | maxEfficiencyClass = pi->Processor.EfficiencyClass; 108 | } 109 | } 110 | 111 | if (pi->Relationship == RelationNumaNode) { 112 | numaNodes++; 113 | } 114 | 115 | if (pi->Relationship == RelationGroup) { 116 | groups = pi->Group.ActiveGroupCount; 117 | } 118 | 119 | if (pi->Relationship == RelationCache) { 120 | if (pi->Cache.CacheSize > maxLlcSize) { 121 | maxLlcSize = pi->Cache.CacheSize; 122 | } 123 | } 124 | 125 | ptr += pi->Size; 126 | } 127 | } 128 | 129 | free(buffer); 130 | } 131 | } 132 | } 133 | 134 | // wrapper for getProcessorInfo() that only collects physical and logical core counts 135 | void getProcessorCount(DWORD& cores, DWORD& logicals) { 136 | DWORD groups, numaNodes, maxLlcSize; 137 | BYTE maxEfficiencyClass; 138 | getProcessorInfo(groups, numaNodes, cores, logicals, maxLlcSize, maxEfficiencyClass); 139 | } 140 | 141 | // Get the processor name string via cpuid instruction intrinsic 142 | // 143 | // name - processor name is a null-terminated byte string of length 49 including the null character 144 | const char* getCpuidName(char* name) { 145 | name[0] = 0; 146 | int data[4]; 147 | 148 | __cpuid(data, 0x80000000); 149 | 150 | if (data[0] >= 0x80000004) { 151 | for (unsigned long long i = 0; i < 3; ++i) { 152 | __cpuid(data, (int)(0x80000002 + i)); 153 | *reinterpret_cast(name + 0 + 16 * i) = data[0]; 154 | *reinterpret_cast(name + 4 + 16 * i) = data[1]; 155 | *reinterpret_cast(name + 8 + 16 * i) = data[2]; 156 | *reinterpret_cast(name + 12 + 16 * i) = data[3]; 157 | } 158 | name[48] = 0; 159 | } 160 | 161 | return name; 162 | } 163 | 164 | // Get the processor vendor name string via cpuid instruction intrinsic 165 | // 166 | // vendor - processor vendor is a null-terminated byte string of length 13 including the null character 167 | const char* getCpuidVendor(char* vendor) { 168 | int data[4]; 169 | __cpuid(data, 0); 170 | 171 | *reinterpret_cast(vendor) = data[1]; 172 | *reinterpret_cast(vendor + 4) = data[3]; 173 | *reinterpret_cast(vendor + 8) = data[2]; 174 | vendor[12] = 0; 175 | 176 | return vendor; 177 | } 178 | 179 | // Get the processor family via cpuid instruction intrinsic 180 | int getCpuidFamily() { 181 | int data[4]; 182 | __cpuid(data, 1); 183 | 184 | int family = ((data[0] >> 8) & 0x0F); 185 | int extendedFamily = (data[0] >> 20) & 0xFF; 186 | 187 | int displayFamily = (family != 0x0F) ? family : (extendedFamily + family); 188 | 189 | return displayFamily; 190 | } 191 | 192 | // ################################################################################################################## 193 | // ### This advice is specific only to AMD processors and is NOT general guidance for all processor manufacturers ### 194 | // ### ### 195 | // ### Remember to profile! ### 196 | // ################################################################################################################## 197 | 198 | // Return a recommended number of hardware threads to use for running your game, taking into account processor family and configuration 199 | // For Ryzen processors with a number of physical cores below the configured threshold, logical processor cores are added to the recommended thread count 200 | #define RYZEN_CORES_THRESHOLD 8 201 | 202 | DWORD getRecommendedThreadCountForGameplay(BOOL forceSingleNumaNode = false, BOOL forceSMT = false, DWORD maxThreadPoolSize = MAXUINT32, DWORD forceThreadPoolSize = 0) { 203 | DWORD groups, numaNodes, cores, logicals, maxLlcSize; 204 | BYTE maxEfficiencyClass; 205 | 206 | getProcessorInfo(groups, numaNodes, cores, logicals, maxLlcSize, maxEfficiencyClass, forceSingleNumaNode); 207 | DWORD count = logicals; 208 | 209 | char vendor[13]; 210 | getCpuidVendor(vendor); 211 | 212 | if (0 == strcmp(vendor, "AuthenticAMD")) { 213 | if (AMD_BULLDOZER_FAMILY == getCpuidFamily()) { 214 | // Use the reported logical processor count on AMD "Bulldozer" family microarchitecture processors 215 | count = logicals; 216 | } 217 | else { 218 | // Use the physical core count, unless the number of physical cores is lower than the defined threshold 219 | count = (cores >= RYZEN_CORES_THRESHOLD) ? cores : logicals; 220 | } 221 | } 222 | 223 | // take into account SMT when calculating thread count 224 | if (forceSMT) { 225 | count = logicals; 226 | } 227 | 228 | // clamp the thread count to at most the size of maxThreadPoolSize 229 | if (maxThreadPoolSize > 0) { 230 | count = min(count, maxThreadPoolSize); 231 | } 232 | 233 | // force a particular thread count 234 | if (forceThreadPoolSize) { 235 | count = forceThreadPoolSize; 236 | } 237 | 238 | // always return at least 1 just in case count is 0 by the time we get here 239 | return max(1, count); 240 | } 241 | 242 | // ################################################################################################################## 243 | // ### This advice is specific only to AMD processors and is NOT general guidance for all processor manufacturers ### 244 | // ### ### 245 | // ### Remember to profile! ### 246 | // ################################################################################################################## 247 | 248 | // Return a recommended number of hardware threads to use for initialising your game, taking into account processor family and configuration 249 | DWORD getRecommendedThreadCountForGameInit(BOOL forceSingleNumaNode = false, BOOL forceSMT = false, DWORD maxThreadPoolSize = MAXUINT32, DWORD forceThreadPoolSize = 0) { 250 | DWORD groups, numaNodes, cores, logicals, maxLlcSize; 251 | BYTE maxEfficiencyClass; 252 | 253 | getProcessorInfo(groups, numaNodes, cores, logicals, maxLlcSize, maxEfficiencyClass, forceSingleNumaNode); 254 | DWORD count = logicals; 255 | 256 | // take into account SMT when calculating thread count 257 | if (forceSMT) { 258 | count = logicals; 259 | } 260 | 261 | // clamp the thread count to at most the size of maxThreadPoolSize 262 | if (maxThreadPoolSize > 0) { 263 | count = min(count, maxThreadPoolSize); 264 | } 265 | 266 | // force a particular thread count 267 | if (forceThreadPoolSize) { 268 | count = forceThreadPoolSize; 269 | } 270 | 271 | // always return at least 1 just in case count is 0 by the time we get here 272 | return max(1, count); 273 | } 274 | 275 | // Print all of the processor information 276 | void printProcessorInfo() { 277 | char name[49]; 278 | getCpuidName(name); 279 | 280 | char vendor[13]; 281 | getCpuidVendor(vendor); 282 | 283 | DWORD groups, numaNodes, cores, logicals, maxLlcSize; 284 | BYTE maxEfficiencyClass; 285 | getProcessorInfo(groups, numaNodes, cores, logicals, maxLlcSize, maxEfficiencyClass); 286 | 287 | int processorFamily = getCpuidFamily(); 288 | 289 | wprintf(L"Processor Name: %hs\n", name); 290 | wprintf(L"Processor Vendor: %hs\n", vendor); 291 | wprintf(L"Processor Family: 0x%x\n", processorFamily); 292 | wprintf(L"Processor Group Count: %lu\n", groups); 293 | wprintf(L"NUMA Node Count: %lu\n", numaNodes); 294 | 295 | if ((0 == strcmp(vendor, "AuthenticAMD")) && (AMD_BULLDOZER_FAMILY == processorFamily)) { 296 | // Print module count for AMD "Bulldozer" family microarchitecture processors 297 | wprintf(L"Processor Module Count: %lu\n", logicals / 2); 298 | wprintf(L"Processor Core Count: %lu\n", logicals); 299 | } 300 | else { 301 | wprintf(L"Processor Core Count: %lu\n", cores); 302 | } 303 | 304 | wprintf(L"Logical Processor Count: %lu\n", logicals); 305 | wprintf(L"Max Last Level Cache Size: %lu Bytes\n", maxLlcSize); 306 | wprintf(L"Max Processor Efficiency Class: 0x%#02x\n", maxEfficiencyClass); // See the getProcessorInfo() documentation for more information on processor efficiency classes 307 | 308 | // If available, print the value of PROCESSOR_POWER_INFORMATION.MaxMhz provided by CallNtPowerInformation(ProcessorInformation) 309 | { 310 | SYSTEM_INFO info; 311 | GetSystemInfo(&info); 312 | void* buffer = malloc(sizeof(PROCESSOR_POWER_INFORMATION) * info.dwNumberOfProcessors); 313 | 314 | if (buffer && 0 == CallNtPowerInformation(ProcessorInformation, NULL, 0, buffer, sizeof(PROCESSOR_POWER_INFORMATION) * info.dwNumberOfProcessors)) { 315 | PROCESSOR_POWER_INFORMATION pi = ((PROCESSOR_POWER_INFORMATION*)buffer)[0]; 316 | wprintf(L"MaxMhz: %lu MHz\n", pi.MaxMhz); // This is typically the processor's base clock 317 | } 318 | 319 | if (buffer) { 320 | free(buffer); 321 | } 322 | } 323 | } 324 | 325 | int main(int argc, char* argv[]) { 326 | wprintf(L"%hs [forceSingleNumaNode] [forceSMT] [maxThreadPoolSize] [forceThreadPoolSize]\n", argv[0]); 327 | 328 | printProcessorInfo(); 329 | 330 | BOOL forceSingleNumaNode = (argc > 1) ? atoi(argv[1]) : 0; 331 | BOOL forceSMT = (argc > 2) ? atoi(argv[2]) : 0; 332 | DWORD maxThreadPoolSize = (argc > 3) ? strtoul(argv[3], NULL, 0) : 0U; 333 | DWORD forceThreadPoolSize = (argc > 4) ? strtoul(argv[4], NULL, 0) : 0U; 334 | DWORD initThreads = getRecommendedThreadCountForGameInit(forceSingleNumaNode, forceSMT, maxThreadPoolSize, forceThreadPoolSize); 335 | DWORD playThreads = getRecommendedThreadCountForGameplay(forceSingleNumaNode, forceSMT, maxThreadPoolSize, forceThreadPoolSize); 336 | 337 | wprintf(L"forceSingleNumaNode: %i, forceSMT: %i, maxThreadPoolSize: %lu, forceThreadPoolSize: %lu\n", forceSingleNumaNode, forceSMT, maxThreadPoolSize, forceThreadPoolSize); 338 | wprintf(L"AMD Recommended Game Init Thread Count: %lu\n", initThreads); 339 | wprintf(L"AMD Recommended Game Play Thread Count: %lu\n", playThreads); 340 | 341 | return 0; 342 | } 343 | --------------------------------------------------------------------------------