├── .gitignore ├── LICENSE ├── MSR_ArchPerfMon_v3.h ├── MSR_Architectural.h ├── MSR_defs.h ├── Makefile ├── README.md ├── SF_test_offsets.c ├── SKX_IMC_BusDeviceFunctionOffset.h ├── SetupCoreCounters.sh ├── SnoopFilterMapper.c ├── low_overhead_timers.c ├── low_overhead_timers.h ├── run_ensemble.sh ├── ssum.c └── va2pa_lib.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.s 3 | *.optrpt 4 | *.exe 5 | log.* 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, John D McCalpin and the University of Texas at Austin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MSR_ArchPerfMon_v3.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------- 2 | // Intel Arch SW Developer's Manual, Volume 3, document 325384-060, September 2016 3 | //------------------------------------------------- 4 | // Part 1: Architectural performance monitoring version 3, Volume 3B, section 18.2 5 | // and Section 35.1 6 | //------------------------------------------------- 7 | #define IA32_PMC0 0xC1L 8 | #define IA32_PMC1 0xC2L 9 | #define IA32_PMC2 0xC3L 10 | #define IA32_PMC3 0xC4L 11 | #define IA32_PMC4 0xC5L 12 | #define IA32_PMC5 0xC6L 13 | #define IA32_PMC6 0xC7L 14 | #define IA32_PMC7 0xC8L 15 | #define IA32_PERFEVTSEL0 0x186L 16 | #define IA32_PERFEVTSEL1 0x187L 17 | #define IA32_PERFEVTSEL2 0x188L 18 | #define IA32_PERFEVTSEL3 0x189L 19 | #define IA32_PERFEVTSEL4 0x18AL 20 | #define IA32_PERFEVTSEL5 0x18BL 21 | #define IA32_PERFEVTSEL6 0x18CL 22 | #define IA32_PERFEVTSEL7 0x18DL 23 | #define IA32_PERF_STATUS 0x198L 24 | #define IA32_THERM_STATUS 0x19CL 25 | #define IA32_PERF_CTL 0x199L 26 | #define IA32_MISC_ENABLE 0x1A0L 27 | #define IA32_FIXED_CTR0 0x309L 28 | #define IA32_FIXED_CTR1 0x30AL 29 | #define IA32_FIXED_CTR2 0x30BL 30 | #define IA32_FIXED_CTR_CTRL 0x38DL 31 | #define IA32_PERF_GLOBAL_STATUS 0x38EL 32 | #define IA32_PERF_GLOBAL_CTRL 
0x38FL 33 | #define IA32_PERF_GLOBAL_OVF_CTRL 0x390L 34 | -------------------------------------------------------------------------------- /MSR_Architectural.h: -------------------------------------------------------------------------------- 1 | // ----------------------------------------------------------------- 2 | // Part 2: Performance-related MSRs from "Architectural MSRs" 3 | // (Volume 3B, Table 35-2) excludes those listed above in 4 | // "Architectural Performance Monitoring" 5 | // NOTE: the full-width counter aliases IA32_A_PMCn occupy consecutive addresses 0x4C1 + n (n = 0..7) 6 | // Name, MSR_Address 7 | #define IA32_TIME_STAMP_COUNTER 0x10L 8 | #define IA32_MPERF 0xE7L 9 | #define IA32_APERF 0xE8L 10 | #define IA32_CLOCK_MODULATION 0x19AL 11 | #define IA32_ENERGY_PERF_BIAS 0x1B0L 12 | #define IA32_PACKAGE_THERM_STATUS 0x1B1L 13 | #define IA32_DEBUGCTL 0x1D9L 14 | #define IA32_PLATFORM_DCA_CAP 0x1F8L 15 | #define IA32_CPU_DCA_CAP 0x1F9L 16 | #define IA32_DCA_0_CAP 0x1FAL 17 | #define IA32_PERF_CAPABILITIES 0x345L 18 | #define IA32_PEBS_ENABLE 0x3F1L 19 | #define IA32_A_PMC0 0x4C1L 20 | #define IA32_A_PMC1 0x4C2L 21 | #define IA32_A_PMC2 0x4C3L 22 | #define IA32_A_PMC3 0x4C4L 23 | #define IA32_A_PMC4 0x4C5L 24 | #define IA32_A_PMC5 0x4C6L 25 | #define IA32_A_PMC6 0x4C7L 26 | #define IA32_A_PMC7 0x4C8L 27 | #define IA32_TSC_AUX 0xC0000103L 28 | -------------------------------------------------------------------------------- /MSR_defs.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------- 2 | // MSR list for performance monitoring utility for Xeon E5 v3 (Haswell EP, 06_3F, Hikari/Wrangler/Lonestar5) 3 | // Revision 0.2, 2017-03-02 4 | // John D. McCalpin, mccalpin@tacc.utexas.edu 5 | //------------------------------------------------- 6 | // This is a shortened version of Xeon_E5_v3_Perf_MSRs.txt that just includes 7 | // the MSR names and numbers in cpp #define format... 
8 | // Since I use the MSR numbers primarily in "pread()" and "pwrite()" calls, 9 | // I will define all of these as signed long (64-bit) integers. 10 | // https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html 11 | //------------------------------------------------- 12 | // Intel Arch SW Developer's Manual, Volume 3, document 325384-060, September 2016 13 | //------------------------------------------------- 14 | // Part 1: Architectural performance monitoring version 3, Volume 3B, section 18.2 and Section 35.1 15 | //------------------------------------------------- 16 | #include "MSR_ArchPerfMon_v3.h" 17 | 18 | // ----------------------------------------------------------------- 19 | // Part 2: Performance-related MSRs from "Architectural MSRs" 20 | // (Volume 3B, Table 35-2) excludes those listed above in 21 | // "Architectural Performance Monitoring" 22 | // 23 | #include "MSR_Architectural.h" 24 | 25 | // ----------------------------------------------------------------- 26 | // Part 3: MSRs from Volume 3B, Tables 34-5 & 34-6 for Nehalem & Westmere, plus Table 34-8 for Xeon 5600 27 | // Skipped here.... 
28 | 29 | // ----------------------------------------------------------------- 30 | // Part 4: MSRs from Volume 3B, Table 34-10, Sandy Bridge 06_2Ah and 06_2Dh 31 | // All Architectural MSRs are included (including Architectural Perf Monitoring, v3) 32 | // 06_2Dh is Xeon E5 (Stampede) 33 | // 06_2Ah is Xeon E3 (Scorpion) 34 | // Stampede: 35 | // DisplayFamily_DisplayModel 06_2Dh 36 | // Family 06, ExtendedFamily 00, Model 13, ExtendedModel 02 37 | // cpuinfo model 45 (decimal) 38 | 39 | // Update 2012-10-02: Downloaded new revision of Volume 3C (326019-044, August 2012) 40 | // and reviewed Sandy Bridge MSRs in Section 35.7, Table 35-11 41 | 42 | // Note: IA32_PERF_STATUS and MSR_PERF_STATUS are both 0x198, but the former only includes P-state, while the latter includes both P-state and voltage info 43 | // Note: All versions of Volume 3B screw up 0x1AD, calling it decimal 428, when it is actually 429. 44 | // Nehalem/Westmere have both 0x1AC and 0x1AD, while Sandy Bridge only defines one of the two 45 | 46 | // Name, MSR, Access, (~WriteMask), Notes 47 | 48 | // Same on Nehalem/Westmere as on Sandy Bridge 49 | #define MSR_PLATFORM_INFO 0xCEL 50 | #define MSR_PKG_CST_CONFIG_CONTROL 0xE2L 51 | #define MSR_TEMPERATURE_TARGET 0x1A2L 52 | #define MSR_MISC_FEATURE_CONTROL 0x1A4L 53 | #define MSR_OFFCORE_RSP_0 0x1A6L 54 | #define MSR_OFFCORE_RSP_1 0x1A7L 55 | #define MSR_MISC_PWR_MGMT 0x1AAL 56 | #define MSR_TURBO_POWER_CURRENT_LIMIT 0x1ACL 57 | #define MSR_TURBO_RATIO_LIMIT 0x1ADL 58 | #define MSR_POWER_CTL 0x1FCL // Intel SDM lists MSR_POWER_CTL at 1FCH (decimal 508) 59 | #define MSR_PEBS_LD_LAT 0x3F6L 60 | #define MSR_PKG_C3_RESIDENCY 0x3F8L 61 | #define MSR_PKG_C6_RESIDENCY 0x3F9L 62 | #define MSR_PKG_C7_RESIDENCY 0x3FAL 63 | #define MSR_CORE_C3_RESIDENCY 0x3FCL 64 | #define MSR_CORE_C6_RESIDENCY 0x3FDL 65 | 66 | // Changes between Westmere and Sandy Bridge -- same register has additional info fields in SNB 67 | #define MSR_PERF_STATUS 0x198L 68 | 69 | // New features in Sandy Bridge 70 | #define MSR_CORE_C7_RESIDENCY 
0x3FEL 71 | #define MSR_RAPL_POWER_UNIT 0x606L 72 | #define MSR_PKG_C2_RESIDENCY 0x60DL 73 | #define MSR_PKG_POWER_LIMIT 0x610L 74 | #define MSR_PKG_ENERGY_STATUS 0x611L 75 | #define MSR_PKG_POWER_INFO 0x614L 76 | #define MSR_PP0_POWER_LIMIT 0x638L 77 | #define MSR_PP0_ENERGY_STATUS 0x639L 78 | #define MSR_PP0_POLICY 0x63AL 79 | #define MSR_PP0_PERF_STATUS 0x63BL 80 | 81 | // ----------------------------------------------------------------- 82 | // Part 5: MSRs from August 2012 Volume 3C, Table 35-12, Extra MSRs for Sandy Bridge 06_2Ah 83 | // Skipped here.... 84 | 85 | // ----------------------------------------------------------------- 86 | // Part 6: MSRs from August 2012 Volume 3C, Table 35-12, Extra MSRs for Sandy Bridge 06_2Dh 87 | // All Architectural MSRs are included (including Architectural Perf Monitoring, v3) 88 | // 06_2Dh is Xeon E5 (Stampede) 89 | 90 | // Name, MSR, Access, (~WriteMask), Notes 91 | #define MSR_PKG_PERF_STATUS 0x613L 92 | #define MSR_RAPL_PERF_STATUS 0x614L 93 | #define MSR_DRAM_POWER_LIMIT 0x618L 94 | #define MSR_DRAM_ENERGY_STATUS 0x619L 95 | #define MSR_DRAM_PERF_STATUS 0x61BL 96 | #define MSR_DRAM_POWER_INFO 0x61CL 97 | 98 | #define U_MSR_PMON_FIXED_CTL 0x703L 99 | #define U_MSR_PMON_FIXED_CTR 0x704L 100 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = icc 2 | #CFLAGS = -DIMC_COUNTS -DCHA_COUNTS -DMAP_L3 -sox -g -O -xCORE-AVX512 3 | CFLAGS = -DIMC_COUNTS -DCHA_COUNTS -sox -g -O -xCORE-AVX512 4 | 5 | default: SnoopFilterMapper 6 | 7 | SnoopFilterMapper.o: SnoopFilterMapper.c 8 | icc $(CFLAGS) -qopenmp -c SnoopFilterMapper.c 9 | 10 | ssum.o: ssum.c 11 | icc -sox -g -O -xCORE-AVX512 -qopt-zmm-usage=high -c ssum.c 12 | 13 | SnoopFilterMapper: SnoopFilterMapper.o ssum.o va2pa_lib.o low_overhead_timers.c 14 | icc $(CFLAGS) -qopenmp SnoopFilterMapper.o ssum.o va2pa_lib.o -o SnoopFilterMapper 15 | 16 | 
SF_test_offsets.o: SF_test_offsets.c 17 | icc $(CFLAGS) -qopenmp -DRANDOMOFFSETS -DMYHUGEPAGE_1GB -c SF_test_offsets.c 18 | 19 | SF_test_offsets: SF_test_offsets.o ssum.o va2pa_lib.o low_overhead_timers.c 20 | icc $(CFLAGS) -qopenmp SF_test_offsets.o ssum.o va2pa_lib.o -o SF_test_offsets 21 | 22 | SnoopFilterMapper_THP.o: SnoopFilterMapper.c 23 | icc $(CFLAGS) -qopenmp -DMYHUGEPAGE_THP -c SnoopFilterMapper.c -o SnoopFilterMapper_THP.o 24 | 25 | SnoopFilterMapper_THP: SnoopFilterMapper_THP.o ssum.o va2pa_lib.o low_overhead_timers.c 26 | icc $(CFLAGS) -qopenmp SnoopFilterMapper_THP.o ssum.o va2pa_lib.o -o SnoopFilterMapper_THP 27 | 28 | clean: 29 | rm -f *.o 30 | rm -f SnoopFilterMapper SnoopFilterMapper_THP SF_test_offsets 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SKX\_SF\_Conflicts -- READ.ME file 2 | 3 | This is remarkably messy code, including way too much processor-specific customization, but it does provide a very compact way to demonstrate Snoop Filter Conflicts on Intel Skylake Xeon processors. 4 | 5 | There are two main programs here: 6 | 7 | 1. SnoopFilterMapper.c 8 | 1. SF\_Test\_Offsets.c 9 | 10 | These are very similar codes that are set up to repeatedly sum a contiguous, nominally 11 | L2-containable array, with extensive performance counter monitoring, to look for evidence 12 | of Snoop Filter Conflicts on Intel Xeon Scalable processors (a.k.a., Skylake Xeon, or Skylake Server). 13 | 14 | * **SnoopFilterMapper** is specialized for use with 2MiB large pages (either pre-allocated 15 | or Transparent Huge Pages). It is intended to be run hundreds or thousands of times 16 | (getting a different set of physical addresses each time), allowing post-processing 17 | of results to investigate the relationship between L2 miss rates (which should be near 18 | zero) and Snoop Filter Evictions. 
19 | 20 | * **SF\_Test\_Offsets** is specialized for use with 1 GiB large pages (which must be pre-allocated 21 | at system boot time). It is intended to be run many times with different offsets 22 | provided on the command line. The offset determines the start of the contiguous, L2- 23 | containable array relative to the beginning of each of the 1 GiB pages used. 24 | In this case, ensembles of runs are not needed, since the largest L2-containable array 25 | size is much smaller than a 1 GiB page (1 MiB per core times 28-cores = 28 MiB), 26 | ensuring that the contiguous virtual address range used corresponds to a contiguous 27 | physical address range. 28 | 29 | ## SnoopFilterMapper 30 | This outline lists the major operations executed by the **SnoopFilterMapper** code 31 | 32 | 1. Allocates a 2GiB array 33 | * Options for 1GiB pages (#ifdef MYHUGEPAGE\_1GB) or pre-allocated 2MiB pages (default) or 2MiB Transparent Huge Pages (#ifdef MYHUGEPAGE_THP). 34 | 1. Initialize/instantiate full 2GiB array. 35 | 1. Grab the physical addresses of each page in the array (either 2 values or 2048 values) and save in an array. 36 | 1. Check CPUID to see if the processor model is correct. 37 | 1. Optionally prepare to use the IMC counters (#ifdef IMC\_COUNTS) 38 | 1. mmap /dev/mem and check for the correct SKX VID/DID for bus 0, device 5, function 0 39 | 1. Optionally set up the CHA counters (#ifdef CHA\_COUNTS) 40 | 1. open one /dev/cpu/\*/msr device in each socket 41 | 1. read and print the four programmable core performance counters for the core in socket 0 42 | 1. program the counters and filter in each of the CHAs (NUM\_CHA\_USED is hardcoded) 43 | 1. Optionally program the IMC counters (#ifdef IMC\_COUNTS) 44 | 1. Optionally determines the mapping of addresses to L3 numbers (#ifdef MAP\_L3) 45 | 1. Mostly written for 2MiB pages. 46 | 1. For the first PAGES\_MAPPED pages: 47 | 1. Check to see if mapping file already exists for the 2MiB page physical address 48 | 1. 
If exists, read the file 49 | 1. else, for each cache line 50 | 1. Read the L3 counts, access the line many times, read the L3 counts 51 | 1. Sanity check results -- if good, store L3 mapping, if bad, repeat. 52 | 1. After mapping all lines in the page, write the mapping file for later use. 53 | 1. After all pages are mapped, add up the number of lines mapped to each CHA. (Not needed once it is shown that short contiguous ranges cover all the CHAs almost-uniformly.) 54 | 1. Run an OpenMP parallel "warm-up" loop of AVX512 instructions to try to make sure the cores have spun up the AVX512 units and boosted the cores to the correct frequency. 55 | 1. Optionally read the initial values of the IMC counters (#ifdef IMC\_COUNTS) 56 | 1. Optionally read the initial values of the CHA counters (#ifdef CHA\_COUNTS) 57 | 1. Code Under Test 58 | 1. save start\_tsc() (in OpenMP master thread) 59 | 1. First OpenMP loop: 60 | 1. Check core number for each thread using RDTSCP TSC_AUX value 61 | 1. NOTE: implies KMP_AFFINITY="granularity=fine" 62 | 1. Read initial values of programmable core counters on each core used. 63 | 1. Second OpenMP loop: 64 | 1. read initial value of fixed-function counters on each core in use 65 | 1. Repeat call to ssum() "inner\_repetitions" times (with individualized array start/stop values per thread). 66 | 1. read final value of fixed-function counters on each core in use 67 | 1. NOTE: these counter reads are inside the OpenMP barrier, so they can be used to detect load imbalance. 68 | 1. Third OpenMP loop: 69 | 1. Check core number for each thread using RDTSCP TSC\_AUX value 70 | 1. Read final values of programmable core counters on each core used. 71 | 1. save end\_tsc() (in OpenMP master thread) 72 | 1. Optionally read the final values of the CHA counters (#ifdef CHA\_COUNTS) 73 | 1. Optionally read the final values of the IMC counters (#ifdef IMC\_COUNTS) 74 | 1. Post-Processing 75 | 1. Compute package sums of core counters 76 | 1. 
Optionally compute package sums of CHA counters 77 | 1. Optionally compute package sums of IMC counters 78 | 1. Compute utilization, average frequency, and IPC for each thread (inside the OpenMP barriers). 79 | 1. Compute snoop filter eviction rate (assumes SF EVICTS are in CHA counter 0) 80 | 81 | ## SF\_Test\_Offsets 82 | The **SF\_test\_offsets** code is very similar to **SnoopFilterMapper**, but is specialized to run the code under test using contiguous memory at various offsets from the base of each 1GiB page. 83 | 84 | * This is probably similar enough to merge with SnoopFilterMapper.c. 85 | * The code includes "#ifdef SIMPLE\_OMP\_LOOP" to run the reduction in scalar mode instead of using the external AVX512 ssum() routine. 86 | * The OpenMP scalar reduction mode loses the "between the barriers" fixed-function core counter data, but retains all the other performance counter data. 87 | 88 | ## Porting Notes 89 | 90 | 1. The main functionality of the codes is enabled/disabled through preprocessor variables 91 | * MAP\_L3 -- if defined, causes the code to use CHA counters to attempt to map each cache line in the contiguous array to one of the L3 cache slices. This code is complex and slow (about 6 seconds for the 32768 cache lines in a 2MiB page), and is not needed when using this code to look for snoop filter conflicts. 92 | * Requires CHA\_COUNTS to be defined, which requires root privileges for access to the /dev/cpu/\*/msr device drivers. 93 | * CHA\_COUNTS -- if defined, causes the code to program the hardware performance counters in each CHA to measure four specific events, and to read these counts before and after the code under test. 94 | * This must be defined to directly measure Snoop Filter Evictions. 95 | * Requires root privileges for access to the /dev/cpu/\*/msr device drivers. 96 | * The events are: 97 | 1. Snoop Filter Evictions: SF\_EVICTION.ALL (sum of M, E, S states) 98 | 2. 
L3 Data Read Lookups: LLC\_LOOKUP.DATA\_READ (requires CHA\_FILTER0[26:17]) 99 | 3. L2 Writebacks to L3: LLC\_LOOKUP.DATA\_WRITE (requires CHA\_FILTER0[26:17]) 100 | 4. L3 Writebacks to Memory: LLC\_VICTIMS.TOTAL (MESF) (does not count clean victims) 101 | * The CHA\_FILTER0 in each CHA is programmed to count all L3 lookups (hit or miss, any state), but not to count SF lookups. 102 | * IMC\_COUNTS -- if defined, causes the code to program the IMC counters in each DRAM channel to measure four events, and to read these counts before and after the code under test. 103 | * This is not required to measure snoop filter conflicts. 104 | * With the L2-contained array access kernel, these counters don't provide any useful information, but they are still in the code for historical reasons. 105 | * If this variable is defined, more portability checks include: 106 | * The file SKX\_IMC\_BusDeviceFunctionOffset.h contains (potentially) machine-specific bus numbers for the PCI configuration space used to access the memory controller performance counters that will need to be checked and/or updated. 107 | * The variable `mmconfig_base` is set to 0x80000000 in the main program. This value is used to map all of PCI configuration space to a local pointer for access to the memory controller performance counters. 108 | * The easiest way to find the correct value for your system is `grep MMCONFIG /proc/iomem` 109 | * The events are: 110 | 1. DRAM cache line reads: CAS\_COUNT.READS 111 | 2. DRAM cache line writes: CAS\_COUNT.WRITES 112 | 3. DRAM bank "activate" operations: ACT.ALL 113 | 4. DRAM bank "precharge" operations due to bank conflicts: PRE\_COUNT.MISS 114 | 2. There are a stupid number of machine-specific defines in the code: 115 | * The ARRAYSIZE is set to 2GiB by default. 116 | * Only a fraction of this is typically used for the contiguous, L2-resident array accesses, but the large size allows for two 1 GiB pages or 1024 2 MiB pages. 
These large sizes are useful when the code is being used to determine the mapping of physical addresses to L3 slices. 117 | * MYPAGESIZE is typically set to 2097152L for **SnoopFilterMapper** and to 1073741824UL for **SF\_Test\_offsets**. 118 | * PAGES\_MAPPED tells the code how many pages to look at when mapping physical addresses to L3 slices (assuming MAP\_L3 is defined). This is typically set to 1/2 the number of cores in use, so that the code will only be mapping the cache lines that are used in the contiguous L2-contained summation kernel. 119 | * NUM\_SOCKETS is set to two for measurement on 2-socket systems, but the typical use case only uses socket 0. 120 | * NUM\_IMC\_CHANNELS is set to 6, which is the correct number of channels per socket for all of the Xeon Scalable Processor models. This would need to be reduced to 4 for testing the "Xeon W-21xx" processors. 121 | * NUM\_CHA\_BOXES is set to 28. All Xeon Scalable Processors have MSR addresses for all 28 CHAs, even if some (or many) are disabled, so there should be no need to change this. 122 | * NUM\_CHA\_USED is set to 24, which is the number of "active" CHAs in the Xeon Platinum 8160 processor. This should be changed to the correct number of active L3 slices for other SKX processor models. 123 | * Inactive CHAs will return zero on all performance counts, so both the individual results and the socket-wide sums should be correct as long as NUM\_CHA\_USED is at least as large as the number of CHA/L3 slices actually active.) 124 | * MAXCORES is set to 96, which is the number of logical processors on a two-socket Xeon Platinum 8160 system with HyperThreading enabled (2 sockets * 24 physical cores/socket * 2 logical processors/physical core = 96). This is used for array sizing, so it only needs to be changed if the actual number of cores used is larger than this value. 125 | * CORES\_USED is set to 24, which is the number of physical cores in a Xeon Platinum 8160 processor. 
The code assumes that this variable matches the OpenMP thread count, with loop structures set up to execute one iteration per OpenMP thread. 126 | * This will need to be changed for testing other core counts. 127 | * The runtime environment must be consistent with the value used in the compilation!!! 128 | * More notes on the runtime environment are included below.... 129 | * RANDOMOFFSETS (the macro name has no underscore; see `-DRANDOMOFFSETS` in the Makefile) does not really mean what it says.... It is not used in **SnoopFilterMapper**, but is used in **SF\_Test\_offsets**. When defined, the code expects an integer argument on the command line. The argument is interpreted as the number of 64-byte cache lines (the code multiplies it by 8 to obtain the starting index in 64-bit array elements) above the base of each 1 GiB page at which to start the contiguous, L2-containable array accesses. This ensures that the contiguous virtual address range used maps to a contiguous physical address range. 130 | * SIMPLE\_OMP\_LOOP in **SF\_Test\_offsets** switches the code under test from an AVX512-optimized external summation routine (ssum.c) to a simple OpenMP sum reduction. 131 | * With the AVX-512 code, the 512-bit loads ensure that each cache line is consumed by a single load operation, so there is no possibility that the cache line can be evicted from the cache while it is still in use. 132 | * With the simple OpenMP reduction, the Intel 18 compiler generates scalar code, so 8 load operations are required to process each cache line. In this case, it is possible for a line to be evicted (by a Snoop Filter Eviction) before it has been completely processed. When this happens, there will be more than one L2 miss associated with processing that cache line one time, and the overall L2 miss rate will increase. 133 | 3. Run time environment 134 | * I use the Intel OpenMP runtime environment variable `KMP_HW_SUBSET=1s,24c,1t` to limit the execution to 24 cores on 1 socket. 135 | * With this environment variable definition, there is no need to set `OMP_NUM_THREADS`. 
136 | * The code assumes that threads are not allowed to migrate between the two thread contexts (logical processors) on each physical core (when HyperThreading is enabled). This can be enforced by the `1t` option to KMP\_HW\_SUBSET, or by adding `granularity=fine` to the KMP\_AFFINITY environment variable. 137 | * The code expects the fixed-function and programmable core performance counters to be enabled and configured correctly on each core that is used: 138 | * IA32\_PERF\_GLOBAL\_CTRL (MSR 0x38f) should be set to 0x70000000f to enable the three fixed-function counters and four programmable counters per core. (With HyperThreading disabled, setting this MSR to 0x7000000ff enables all eight programmable counters on each core). 139 | * IA32\_FIXED\_CTR\_CTRL (MSR 0x38d) should be set to 0x333 to enable the three fixed-function counters to count in both user and kernel mode. This may require disabling the NMI watchdog, which typically uses one of these counters. 140 | * The script "SetupCoreCounters.sh" sets the core counters to a useful set: 141 | 1. MEM\_INST\_RETIRED.ALL\_LOADS (0x004381d0) 142 | 2. L1D.REPLACEMENT (0x00430151) 143 | 3. L2\_RQSTS.MISS (0x00433f24) 144 | 4. L2\_LINES\_IN.ALL (0x00431ff1) 145 | * If CHA\_COUNTS is defined, the code will print out the core performance counter event select registers on core 0, but does not check to see if the values are the expected ones. 146 | 147 | 148 | -------------------------------------------------------------------------------- /SF_test_offsets.c: -------------------------------------------------------------------------------- 1 | // John D. 
McCalpin, mccalpin@tacc.utexas.edu 2 | static char const rcsid[] = "$Id: SF_test_offsets.c,v 1.4 2018/05/17 22:20:24 mccalpin Exp mccalpin $"; 3 | 4 | // include files 5 | #include <stdio.h> // printf, etc 6 | #include <stdint.h> // standard integer types, e.g., uint32_t 7 | #include <signal.h> // for signal handler 8 | #include <stdlib.h> // exit() and EXIT_FAILURE 9 | #include <string.h> // strerror() function converts errno to a text string for printing 10 | #include <fcntl.h> // for open() 11 | #include <errno.h> // errno support 12 | #include <assert.h> // assert() function 13 | #include <unistd.h> // sysconf() function, sleep() function 14 | #include <sys/mman.h> // support for mmap() function 15 | #include <linux/mman.h> // required for 1GiB page support in mmap() 16 | #include <math.h> // for pow() function used in RAPL computations 17 | #include <time.h> // NOTE(review): header name was lost in extraction; <time.h> is presumed -- confirm against the upstream file 18 | #include <sys/time.h> // for gettimeofday 19 | 20 | # define ARRAYSIZE 2147483648L 21 | 22 | #ifdef MYHUGEPAGE_1GB 23 | // 1 GiB pages 24 | #define MYPAGESIZE 1073741824UL 25 | #define NUMPAGES 32L 26 | #define PAGES_MAPPED 32L // this code is not working correctly for 1GiB pages, but I already know the answers.... 
27 | #else 28 | #define MYPAGESIZE 2097152L 29 | #define NUMPAGES 1024L 30 | #define PAGES_MAPPED 14L 31 | #endif 32 | 33 | #define SPECIAL_VALUE (-1) 34 | 35 | // interfaces for va2pa_lib.c 36 | void print_pagemap_entry(unsigned long long pagemap_entry); 37 | unsigned long long get_pagemap_entry( void * va ); 38 | 39 | int dumpall; // when set to 1, will cause dump of lots of stuff for debugging 40 | int report; 41 | int nwraps; // track number of performance counter wraps 42 | 43 | double *array; // array pointer to mmap on 1GiB pages 44 | double *page_pointers[NUMPAGES]; // one pointer for each page allocated 45 | uint64_t pageframenumber[NUMPAGES]; // one PFN entry for each page allocated 46 | 47 | // constant value defines 48 | # define NUM_SOCKETS 2 // 49 | # define NUM_IMC_CHANNELS 6 // includes channels on all IMCs in a socket 50 | # define NUM_IMC_COUNTERS 5 // 0-3 are the 4 programmable counters, 4 is the fixed-function DCLK counter 51 | # define NUM_CHA_BOXES 28 52 | # define NUM_CHA_USED 28 53 | # define NUM_CHA_COUNTERS 4 54 | 55 | long imc_counts[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][2]; // including the fixed-function (DCLK) counter as the final entry 56 | long imc_pkg_sums[NUM_SOCKETS][NUM_IMC_COUNTERS]; // sum across channels for each chip 57 | char imc_event_name[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][32]; // reserve 32 characters for the IMC event names for each socket, channel, counter 58 | uint32_t imc_perfevtsel[NUM_IMC_COUNTERS]; // expected control settings for the counters 59 | uint32_t imc_vid_did[3]; // PCIe configuration space vendor and device IDs for the IMC blocks 60 | long cha_counts[NUM_SOCKETS][NUM_CHA_BOXES][NUM_CHA_COUNTERS][2]; // 2 sockets, 28 tiles per socket, 4 counters per tile, 2 times (before and after) 61 | uint32_t cha_perfevtsel[NUM_CHA_COUNTERS]; 62 | long cha_pkg_sums[NUM_SOCKETS][NUM_CHA_COUNTERS]; 63 | 64 | #define MAXCORES 112 65 | #define CORES_USED 24 66 | // New feature -- core counters. 
67 | // upgrade to include counters for all cores 68 | long core_counters[MAXCORES][4][2]; // 24 cores & 24 threads on one socket, 4 counters, before and after 69 | long fixed_counters[MAXCORES][4][2]; // 24 cores with 4 fixed-function core counters (Instr, CoreCyc, RefCyc, TSC) 70 | long core_pkg_sums[NUM_SOCKETS][4]; // four core counters 71 | long fixed_pkg_sums[NUM_SOCKETS][4]; // four fixed-function counters per core (Instr, CoreCyc, RefCyc, TSC) 72 | 73 | int8_t cha_by_page[PAGES_MAPPED][32768]; // L3 numbers for each of the 32,768 cache lines in each of the first PAGES_MAPPED 2MiB pages 74 | uint64_t paddr_by_page[PAGES_MAPPED]; // physical addresses of the base of each of the first PAGES_MAPPED 2MiB pages used 75 | long lines_by_cha[NUM_CHA_USED]; // bulk count of lines assigned to each CHA 76 | 77 | #ifdef DEBUG 78 | FILE *log_file; // log file for debugging -- should not be needed in production 79 | #endif 80 | unsigned int *mmconfig_ptr; // must be pointer to 32-bit int so compiler will generate 32-bit loads and stores 81 | 82 | struct timeval tp; // seconds and microseconds from gettimeofday 83 | struct timezone tzp; // required, but not used here. 
84 | 85 | double ssum(double *a, long vl); 86 | 87 | double mysecond() 88 | { 89 | struct timeval tp; 90 | struct timezone tzp; 91 | int i; 92 | 93 | i = gettimeofday(&tp,&tzp); 94 | return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); 95 | } 96 | 97 | # ifndef MIN 98 | # define MIN(x,y) ((x)<(y)?(x):(y)) 99 | # endif 100 | # ifndef MAX 101 | # define MAX(x,y) ((x)>(y)?(x):(y)) 102 | # endif 103 | 104 | 105 | #include "low_overhead_timers.c" 106 | 107 | 108 | #include "SKX_IMC_BusDeviceFunctionOffset.h" 109 | #include "MSR_defs.h" 110 | // =========================================================================================================================================================================== 111 | // Convert PCI(bus:device.function,offset) to uint32_t array index (the packed configuration-space byte address divided by 4) 112 | uint32_t PCI_cfg_index(unsigned int Bus, unsigned int Device, unsigned int Function, unsigned int Offset) 113 | { 114 | uint32_t byteaddress; 115 | uint32_t index; 116 | // All arguments are unsigned, so only the upper bounds need to be checked. 117 | // Bus is limited to 8 bits: each bus occupies 1 MiB (1<<20 bytes), so 256 buses span 256 MiB of configuration space. 118 | assert (Bus < (1<<8)); 119 | assert (Device < (1<<5)); 120 | assert (Function < (1<<3)); 121 | assert (Offset < (1<<12)); 122 | byteaddress = (Bus<<20) | (Device<<15) | (Function<<12) | Offset; 123 | index = byteaddress / 4; 124 | return ( index ); 125 | } 126 | 127 | // =========================================================================================================================================================================== 128 | int main(int argc, char *argv[]) 129 | { 130 | // local declarations 131 | // int cpuid_return[4]; 132 | int i; 133 | int retries; 134 | int zeros; 135 | int rc; 136 | int core_pmc_width, fixed_pmc_width; // these will be looked up using CPUID to use in overflow/wraparound correction 137 | int uncore_pmc_width=48; // all the uncore stuff is model-dependent, but most are 48 bits 138 | ssize_t rc64; 139 | char description[100]; 140 | size_t len; 141 | long arraylen; 142 | long l2_contained_size, inner_repetitions; 
143 | unsigned long pagemapentry; 144 | unsigned long paddr, basephysaddr; 145 | unsigned long pagenum, basepagenum; 146 | uint32_t bus, device, function, offset, ctl_offset, ctr_offset, value, index; 147 | uint32_t socket, imc, channel, counter, controller; 148 | long count,delta; 149 | long j,k,page_number,page_base_index,line_number; 150 | long jstart[CORES_USED], jend[CORES_USED], mycore, vl[CORES_USED]; 151 | uint32_t low_0, high_0, low_1, high_1; 152 | char filename[100]; 153 | int pkg, tile; 154 | int nr_cpus; 155 | uint64_t msr_val, msr_num; 156 | int mem_fd; 157 | int msr_fd[2]; // one for each socket 158 | int proc_in_pkg[2]; // one Logical Processor number for each socket 159 | uid_t my_uid; 160 | gid_t my_gid; 161 | double sum,expected; 162 | double t0, t1; 163 | double avg_cycles; 164 | unsigned long tsc_start, tsc_end; 165 | float TSC_GHz; 166 | double sf_evict_rate; 167 | double bandwidth; 168 | unsigned long mmconfig_base=0x80000000; // DOUBLE-CHECK THIS ON NEW SYSTEMS!!!!! 
grep MMCONFIG /proc/iomem | awk -F- '{print $1}' 169 | unsigned long mmconfig_size=0x10000000; 170 | double private_sum,partial_sums[CORES_USED]; 171 | long iters,iteration_counts[CORES_USED]; 172 | long BaseOffset; 173 | 174 | TSC_GHz = get_TSC_frequency()/1.0e9; 175 | core_pmc_width = get_core_counter_width(); 176 | fixed_pmc_width = get_fixed_counter_width(); 177 | 178 | BaseOffset = 0; 179 | #ifdef RANDOMOFFSETS 180 | if (argc != 2) { 181 | printf("Must Provide a Random Offset cache line offset value (an integer between 0 and 2^24-375000 (16,402,216))\n"); 182 | exit(1); 183 | } else { 184 | BaseOffset = atol(argv[1]); 185 | printf("Random Cache Line Offset is %ld\n",BaseOffset); 186 | BaseOffset = BaseOffset*8; 187 | printf("Starting index for summation is %ld\n",BaseOffset); 188 | } 189 | #endif 190 | 191 | retries = 0; 192 | zeros = 0; 193 | report = 1; 194 | dumpall = 0; 195 | nwraps = 0; 196 | l2_contained_size = 125000 * CORES_USED; // about 95% of the L2 space in the cores used 197 | // l2_contained_size = 87380 * CORES_USED; // with 24 cores, this gives almost exactly 16 MiB 198 | for (i=0; i 100) { 646 | printf("ERROR: No good results for line %d after %d tries\n",line_number,numtries); 647 | exit(101); 648 | } 649 | totaltries++; 650 | // 1. 
read L3 counters before starting test 651 | for (tile=0; tile95%) 690 | // goodness2 = min/NFLUSHES (pass if <20%) 691 | // goodness3 = avg/NFLUSHES (pass if <40%) 692 | max_count = 0; 693 | min_count = 1<<30; 694 | sum_count = 0; 695 | for (tile=0; tile 0.95 ) pass1 = 1; 709 | if ( goodness2 < 0.20 ) pass2 = 1; 710 | if ( goodness3 < 0.40 ) pass3 = 1; 711 | good_new = pass1 * pass2 * pass3; 712 | #ifdef VERBOSE 713 | printf("GOODNESS: line_number %ld max_count %d min_count %d sum_count %d avg_count %f goodness1 %f goodness2 %f goodness3 %f pass123 %d %d %d\n", 714 | line_number, max_count, min_count, sum_count, avg_count, goodness1, goodness2, goodness3, pass1, pass2, pass3); 715 | if (good_new == 0) printf("DEBUG: one or more of the sanity checks failed for line=%ld: %d %d %d goodness values %f %f %f\n", 716 | line_number,pass1,pass2,pass3,goodness1,goodness2,goodness3); 717 | #endif 718 | 719 | // test to see if more than one CHA reports > 0.95*NFLUSHES events 720 | found = 0; 721 | old_cha = -1; 722 | int min_counts = (NFLUSHES*19)/20; 723 | for (tile=0; tile= min_counts) { 725 | old_cha = cha_by_page[page_number][line_number]; 726 | cha_by_page[page_number][line_number] = tile; 727 | found++; 728 | #ifdef VERBOSE 729 | if (found > 1) { 730 | printf("WARNING: Multiple (%d) CHAs found using counter 1 for cache line %ld, index %ld: old_cha %d new_cha %d\n",found,line_number,page_base_index+line_number*8,old_cha,cha_by_page[page_number][line_number]); 731 | } 732 | #endif 733 | } 734 | } 735 | if (found == 0) { 736 | good_old = 0; 737 | #ifdef VERBOSE 738 | printf("WARNING: no CHA entry has been found for line %ld!\n",line_number); 739 | printf("DEBUG dump for no CHA found\n"); 740 | for (tile=0; tile 0) { 835 | for (i=0; i low_0)) 893 | // (this indicates that the counter rolled between the 3rd and 4th reads). 
894 | low_0 = mmconfig_ptr[index]; 895 | high_0 = mmconfig_ptr[index+1]; 896 | 897 | low_1 = mmconfig_ptr[index]; 898 | high_1 = mmconfig_ptr[index+1]; 899 | 900 | if ( (high_1 != high_0) && (low_1 > low_0) ) { 901 | count = ((uint64_t) high_0) << 32 | (uint64_t) low_0; 902 | } else { 903 | count = ((uint64_t) high_1) << 32 | (uint64_t) low_1; 904 | } 905 | imc_counts[socket][channel][counter][0] = count; 906 | } 907 | } 908 | } 909 | #if 0 910 | // for debugging only: print initial values of IMC counts 911 | for (socket=0; socket low_0)) 1074 | // (this indicates that the counter rolled between the 3rd and 4th reads). 1075 | low_0 = mmconfig_ptr[index]; 1076 | high_0 = mmconfig_ptr[index+1]; 1077 | 1078 | low_1 = mmconfig_ptr[index]; 1079 | high_1 = mmconfig_ptr[index+1]; 1080 | 1081 | if ( (high_1 != high_0) && (low_1 > low_0) ) { 1082 | count = ((uint64_t) high_0) << 32 | (uint64_t) low_0; 1083 | } else { 1084 | count = ((uint64_t) high_1) << 32 | (uint64_t) low_1; 1085 | } 1086 | imc_counts[socket][channel][counter][1] = count; 1087 | } 1088 | } 1089 | } 1090 | #endif 1091 | // ================================== END OF PERFORMANCE COUNTER READS AFTER TEST ============================================== 1092 | 1093 | t0 = 0.0; 1094 | t1 = (double) (tsc_end - tsc_start) / TSC_GHz / 1.0e9; 1095 | printf("Instrumented code required %f seconds to execute\n",t1-t0); 1096 | bandwidth = sizeof(double)*(double)l2_contained_size*(double)inner_repetitions / (t1-t0) / 1e9; 1097 | printf("Bandwidth %f GB/s\n",bandwidth); 1098 | printf("Bandwidth per core %f GB/s\n",bandwidth/(double)CORES_USED); 1099 | printf("Approx Bytes/cycle per core %f\n",bandwidth/(double)CORES_USED/2.0); 1100 | 1101 | expected = (double)l2_contained_size * (double)(inner_repetitions) / (double)CORES_USED; 1102 | avg_cycles = (double)(tsc_end - tsc_start) / expected; 1103 | printf("Average TSC cycles per element %f\n",avg_cycles); 1104 | 1105 | // clear the arrays for the package-level sums 1106 | for 
(pkg=0; pkg<2; pkg++) { 1107 | for (counter=0; counter<4; counter++) { // no point in summing the cycle counts, so exclude counter 4 1108 | core_pkg_sums[pkg][counter] = 0; 1109 | fixed_pkg_sums[pkg][counter] = 0; 1110 | imc_pkg_sums[pkg][counter] = 0; 1111 | cha_pkg_sums[pkg][counter] = 0; 1112 | } 1113 | } 1114 | 1115 | // compute core package sums and optional print 1116 | for (i=0; i // printf, etc 6 | #include // standard integer types, e.g., uint32_t 7 | #include // for signal handler 8 | #include // exit() and EXIT_FAILURE 9 | #include // strerror() function converts errno to a text string for printing 10 | #include // for open() 11 | #include // errno support 12 | #include // assert() function 13 | #include // sysconf() function, sleep() function 14 | #include // support for mmap() function 15 | #include // required for 1GiB page support in mmap() 16 | #include // for pow() function used in RAPL computations 17 | #include 18 | #include // for gettimeofday 19 | 20 | # define ARRAYSIZE 2147483648L 21 | 22 | // MYHUGEPAGE_1GB overrides default of 2MiB for hugepages 23 | #if defined MYHUGEPAGE_1GB 24 | #define MYPAGESIZE 1073741824UL 25 | #define NUMPAGES 2L 26 | #define PAGES_MAPPED 2L // this is still specifying how many 2MiB pages to map 27 | #else 28 | #define MYPAGESIZE 2097152L 29 | #define NUMPAGES 1024L 30 | #define PAGES_MAPPED 14L 31 | #endif 32 | 33 | 34 | #define SPECIAL_VALUE (-1) 35 | 36 | // interfaces for va2pa_lib.c 37 | void print_pagemap_entry(unsigned long long pagemap_entry); 38 | unsigned long long get_pagemap_entry( void * va ); 39 | 40 | int dumpall; // when set to 1, will cause dump of lots of stuff for debugging 41 | int report; 42 | int nwraps; // track number of performance counter wraps 43 | 44 | double *array; // array pointer to mmap on 1GiB pages 45 | double *page_pointers[NUMPAGES]; // one pointer for each page allocated 46 | uint64_t pageframenumber[NUMPAGES]; // one PFN entry for each page allocated 47 | 48 | // constant value 
defines 49 | # define NUM_SOCKETS 2 // 50 | # define NUM_IMC_CHANNELS 6 // includes channels on all IMCs in a socket 51 | # define NUM_IMC_COUNTERS 5 // 0-3 are the 4 programmable counters, 4 is the fixed-function DCLK counter 52 | # define NUM_CHA_BOXES 28 53 | # define NUM_CHA_USED 28 54 | # define NUM_CHA_COUNTERS 4 55 | 56 | long imc_counts[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][2]; // including the fixed-function (DCLK) counter as the final entry 57 | long imc_pkg_sums[NUM_SOCKETS][NUM_IMC_COUNTERS]; // sum across channels for each chip 58 | char imc_event_name[NUM_SOCKETS][NUM_IMC_CHANNELS][NUM_IMC_COUNTERS][32]; // reserve 32 characters for the IMC event names for each socket, channel, counter 59 | uint32_t imc_perfevtsel[NUM_IMC_COUNTERS]; // expected control settings for the counters 60 | uint32_t imc_vid_did[3]; // PCIe configuration space vendor and device IDs for the IMC blocks 61 | long cha_counts[NUM_SOCKETS][NUM_CHA_BOXES][NUM_CHA_COUNTERS][2]; // 2 sockets, 28 tiles per socket, 4 counters per tile, 2 times (before and after) 62 | uint32_t cha_perfevtsel[NUM_CHA_COUNTERS]; 63 | long cha_pkg_sums[NUM_SOCKETS][NUM_CHA_COUNTERS]; 64 | 65 | #define MAXCORES 112 66 | #define CORES_USED 24 67 | // New feature -- core counters. 
68 | // upgrade to include counters for all cores 69 | long core_counters[MAXCORES][4][2]; // 24 cores & 24 threads on one socket, 4 counters, before and after 70 | long fixed_counters[MAXCORES][4][2]; // 24 cores with 4 fixed-function core counters (Instr, CoreCyc, RefCyc, TSC) 71 | long core_pkg_sums[NUM_SOCKETS][4]; // four core counters 72 | long fixed_pkg_sums[NUM_SOCKETS][4]; // four fixed-function counters per core (Instr, CoreCyc, RefCyc, TSC) 73 | 74 | int8_t cha_by_page[PAGES_MAPPED][32768]; // L3 numbers for each of the 32,768 cache lines in each of the first PAGES_MAPPED 2MiB pages 75 | uint64_t paddr_by_page[PAGES_MAPPED]; // physical addresses of the base of each of the first PAGES_MAPPED 2MiB pages used 76 | long lines_by_cha[NUM_CHA_USED]; // bulk count of lines assigned to each CHA 77 | 78 | #ifdef DEBUG 79 | FILE *log_file; // log file for debugging -- should not be needed in production 80 | #endif 81 | unsigned int *mmconfig_ptr; // must be pointer to 32-bit int so compiler will generate 32-bit loads and stores 82 | 83 | struct timeval tp; // seconds and microseconds from gettimeofday 84 | struct timezone tzp; // required, but not used here. 
85 | 86 | double ssum(double *a, long vl); 87 | 88 | double mysecond() 89 | { 90 | struct timeval tp; 91 | struct timezone tzp; 92 | int i; 93 | 94 | i = gettimeofday(&tp,&tzp); 95 | return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); 96 | } 97 | 98 | # ifndef MIN 99 | # define MIN(x,y) ((x)<(y)?(x):(y)) 100 | # endif 101 | # ifndef MAX 102 | # define MAX(x,y) ((x)>(y)?(x):(y)) 103 | # endif 104 | 105 | 106 | #include "low_overhead_timers.c" 107 | 108 | 109 | #include "SKX_IMC_BusDeviceFunctionOffset.h" 110 | #include "MSR_defs.h" 111 | // =========================================================================================================================================================================== 112 | // Convert PCI(bus:device.function,offset) to uint32_t array index 113 | uint32_t PCI_cfg_index(unsigned int Bus, unsigned int Device, unsigned int Function, unsigned int Offset) 114 | { 115 | uint32_t byteaddress; 116 | uint32_t index; 117 | assert (Device >= 0); 118 | assert (Function >= 0); 119 | assert (Offset >= 0); 120 | assert (Device < (1<<5)); 121 | assert (Function < (1<<3)); 122 | assert (Offset < (1<<12)); 123 | byteaddress = (Bus<<20) | (Device<<15) | (Function<<12) | Offset; 124 | index = byteaddress / 4; 125 | return ( index ); 126 | } 127 | 128 | // =========================================================================================================================================================================== 129 | int main(int argc, char *argv[]) 130 | { 131 | // local declarations 132 | // int cpuid_return[4]; 133 | int i; 134 | int retries; 135 | int zeros; 136 | int rc; 137 | int core_pmc_width, fixed_pmc_width; // these will be looked up using CPUID to use in overflow/wraparound correction 138 | int uncore_pmc_width=48; // all the uncore stuff is model-dependent, but most are 48 bits 139 | ssize_t rc64; 140 | char description[100]; 141 | size_t len; 142 | long arraylen; 143 | long l2_contained_size, 
inner_repetitions; 144 | unsigned long pagemapentry; 145 | unsigned long paddr, basephysaddr; 146 | unsigned long pagenum, basepagenum; 147 | uint32_t bus, device, function, offset, ctl_offset, ctr_offset, value, index; 148 | uint32_t socket, imc, channel, counter, controller; 149 | long count,delta; 150 | long j,k,page_number,page_base_index,line_number; 151 | long jstart[CORES_USED], jend[CORES_USED], mycore, vl[CORES_USED]; 152 | uint32_t low_0, high_0, low_1, high_1; 153 | char filename[100]; 154 | int pkg, tile; 155 | int nr_cpus; 156 | uint64_t msr_val, msr_num; 157 | int mem_fd; 158 | int msr_fd[2]; // one for each socket 159 | int proc_in_pkg[2]; // one Logical Processor number for each socket 160 | uid_t my_uid; 161 | gid_t my_gid; 162 | double sum,expected; 163 | double t0, t1; 164 | double avg_cycles; 165 | unsigned long tsc_start, tsc_end; 166 | float TSC_GHz; 167 | double sf_evict_rate; 168 | double bandwidth; 169 | unsigned long mmconfig_base=0x80000000; // DOUBLE-CHECK THIS ON NEW SYSTEMS!!!!! 
grep MMCONFIG /proc/iomem | awk -F- '{print $1}' 170 | unsigned long mmconfig_size=0x10000000; 171 | double private_sum,partial_sums[CORES_USED]; 172 | long iters,iteration_counts[CORES_USED]; 173 | long BaseOffset; 174 | 175 | TSC_GHz = get_TSC_frequency()/1.0e9; 176 | core_pmc_width = get_core_counter_width(); 177 | fixed_pmc_width = get_fixed_counter_width(); 178 | 179 | BaseOffset = 0; 180 | #ifdef RANDOMOFFSETS 181 | if (argc != 2) { 182 | printf("Must Provide a Random Offset cache line offset value (an integer between 0 and 2^24-375000 (16,402,216))\n"); 183 | exit(1); 184 | } else { 185 | BaseOffset = atol(argv[1]); 186 | printf("Random Cache Line Offset is %ld\n",BaseOffset); 187 | BaseOffset = BaseOffset*8; 188 | printf("Starting index for summation is %ld\n",BaseOffset); 189 | } 190 | #endif 191 | 192 | retries = 0; 193 | zeros = 0; 194 | report = 1; 195 | dumpall = 0; 196 | nwraps = 0; 197 | l2_contained_size = 125000 * CORES_USED; // about 95% of the L2 space in the cores used 198 | for (i=0; i 100) { 653 | printf("ERROR: No good results for line %d after %d tries\n",line_number,numtries); 654 | exit(101); 655 | } 656 | totaltries++; 657 | // 1. 
read L3 counters before starting test 658 | for (tile=0; tile95%) 697 | // goodness2 = min/NFLUSHES (pass if <20%) 698 | // goodness3 = avg/NFLUSHES (pass if <40%) 699 | max_count = 0; 700 | min_count = 1<<30; 701 | sum_count = 0; 702 | for (tile=0; tile 0.95 ) pass1 = 1; 716 | if ( goodness2 < 0.20 ) pass2 = 1; 717 | if ( goodness3 < 0.40 ) pass3 = 1; 718 | good_new = pass1 * pass2 * pass3; 719 | #ifdef VERBOSE 720 | printf("GOODNESS: line_number %ld max_count %d min_count %d sum_count %d avg_count %f goodness1 %f goodness2 %f goodness3 %f pass123 %d %d %d\n", 721 | line_number, max_count, min_count, sum_count, avg_count, goodness1, goodness2, goodness3, pass1, pass2, pass3); 722 | if (good_new == 0) printf("DEBUG: one or more of the sanity checks failed for line=%ld: %d %d %d goodness values %f %f %f\n", 723 | line_number,pass1,pass2,pass3,goodness1,goodness2,goodness3); 724 | #endif 725 | 726 | // test to see if more than one CHA reports > 0.95*NFLUSHES events 727 | found = 0; 728 | old_cha = -1; 729 | int min_counts = (NFLUSHES*19)/20; 730 | for (tile=0; tile= min_counts) { 732 | old_cha = cha_by_page[page_number][line_number]; 733 | cha_by_page[page_number][line_number] = tile; 734 | found++; 735 | #ifdef VERBOSE 736 | if (found > 1) { 737 | printf("WARNING: Multiple (%d) CHAs found using counter 1 for cache line %ld, index %ld: old_cha %d new_cha %d\n",found,line_number,page_base_index+line_number*8,old_cha,cha_by_page[page_number][line_number]); 738 | } 739 | #endif 740 | } 741 | } 742 | if (found == 0) { 743 | good_old = 0; 744 | #ifdef VERBOSE 745 | printf("WARNING: no CHA entry has been found for line %ld!\n",line_number); 746 | printf("DEBUG dump for no CHA found\n"); 747 | for (tile=0; tile low_0)) 880 | // (this indicates that the counter rolled between the 3rd and 4th reads). 
881 | low_0 = mmconfig_ptr[index]; 882 | high_0 = mmconfig_ptr[index+1]; 883 | 884 | low_1 = mmconfig_ptr[index]; 885 | high_1 = mmconfig_ptr[index+1]; 886 | 887 | if ( (high_1 != high_0) && (low_1 > low_0) ) { 888 | count = ((uint64_t) high_0) << 32 | (uint64_t) low_0; 889 | } else { 890 | count = ((uint64_t) high_1) << 32 | (uint64_t) low_1; 891 | } 892 | imc_counts[socket][channel][counter][0] = count; 893 | } 894 | } 895 | } 896 | #if 0 897 | // for debugging only: print initial values of IMC counts 898 | for (socket=0; socket low_0)) 1049 | // (this indicates that the counter rolled between the 3rd and 4th reads). 1050 | low_0 = mmconfig_ptr[index]; 1051 | high_0 = mmconfig_ptr[index+1]; 1052 | 1053 | low_1 = mmconfig_ptr[index]; 1054 | high_1 = mmconfig_ptr[index+1]; 1055 | 1056 | if ( (high_1 != high_0) && (low_1 > low_0) ) { 1057 | count = ((uint64_t) high_0) << 32 | (uint64_t) low_0; 1058 | } else { 1059 | count = ((uint64_t) high_1) << 32 | (uint64_t) low_1; 1060 | } 1061 | imc_counts[socket][channel][counter][1] = count; 1062 | } 1063 | } 1064 | } 1065 | #endif 1066 | // ================================== END OF PERFORMANCE COUNTER READS AFTER TEST ============================================== 1067 | 1068 | t0 = 0.0; 1069 | t1 = (double) (tsc_end - tsc_start) / TSC_GHz / 1.0e9; 1070 | printf("Instrumented code required %f seconds to execute\n",t1-t0); 1071 | bandwidth = sizeof(double)*(double)l2_contained_size*(double)inner_repetitions / (t1-t0) / 1e9; 1072 | printf("Bandwidth %f GB/s\n",bandwidth); 1073 | printf("Bandwidth per core %f GB/s\n",bandwidth/(double)CORES_USED); 1074 | printf("Approx Bytes/cycle per core %f\n",bandwidth/(double)CORES_USED/2.0); 1075 | 1076 | expected = (double)l2_contained_size * (double)(inner_repetitions) / (double)CORES_USED; 1077 | avg_cycles = (double)(tsc_end - tsc_start) / expected; 1078 | printf("Average TSC cycles per element %f\n",avg_cycles); 1079 | 1080 | // clear the arrays for the package-level sums 1081 | for 
(pkg=0; pkg<2; pkg++) { 1082 | for (counter=0; counter<4; counter++) { // no point in summing the cycle counts, so exclude counter 4 1083 | core_pkg_sums[pkg][counter] = 0; 1084 | fixed_pkg_sums[pkg][counter] = 0; 1085 | imc_pkg_sums[pkg][counter] = 0; 1086 | cha_pkg_sums[pkg][counter] = 0; 1087 | } 1088 | } 1089 | 1090 | // compute core package sums and optional print 1091 | for (i=0; i>12; 57 | *core = c & 0xFFFUL; 58 | 59 | return (a | (d << 32)); 60 | } 61 | 62 | 63 | extern inline __attribute__((always_inline)) int get_core_number() 64 | { 65 | unsigned long a, d, c; 66 | 67 | __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 68 | 69 | return ( c & 0xFFFUL ); 70 | } 71 | 72 | extern inline __attribute__((always_inline)) int get_socket_number() 73 | { 74 | unsigned long a, d, c; 75 | 76 | __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); 77 | 78 | return ( (c & 0xF000UL)>>12 ); 79 | } 80 | 81 | 82 | extern inline __attribute__((always_inline)) unsigned long rdpmc_instructions() 83 | { 84 | unsigned long a, d, c; 85 | 86 | c = (1UL<<30); 87 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 88 | 89 | return (a | (d << 32)); 90 | } 91 | 92 | extern inline __attribute__((always_inline)) unsigned long rdpmc_actual_cycles() 93 | { 94 | unsigned long a, d, c; 95 | 96 | c = (1UL<<30)+1; 97 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 98 | 99 | return (a | (d << 32)); 100 | } 101 | 102 | extern inline __attribute__((always_inline)) unsigned long rdpmc_reference_cycles() 103 | { 104 | unsigned long a, d, c; 105 | 106 | c = (1UL<<30)+2; 107 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 108 | 109 | return (a | (d << 32)); 110 | } 111 | 112 | extern inline __attribute__((always_inline)) unsigned long rdpmc(int c) 113 | { 114 | unsigned long a, d; 115 | 116 | __asm__ volatile("rdpmc" : "=a" (a), "=d" (d) : "c" (c)); 117 | 118 | return (a | (d << 32)); 119 | } 120 | 121 | // core performance counter width varies by 
// processor
// the width is contained in bits 23:16 of the EAX register
// after executing the CPUID instruction with an initial EAX
// argument of 0x0a (subleaf 0x0 in ECX).
//
// Returns: the 8-bit width field extracted from EAX (0..255).
int get_core_counter_width(void)
{
    unsigned int eax, ebx, ecx, edx;    // all four are written by CPUID; only EAX is used here
    const unsigned int leaf = 0x0000000a;
    const unsigned int subleaf = 0x0;

    __asm__ __volatile__ ("cpuid"
                          : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                          : "a" (leaf), "c" (subleaf));

    return (int) ((eax & 0x00ff0000) >> 16);
}

// fixed-function performance counter width varies by processor
// the width is contained in bits 12:5 of the EDX register
// after executing the CPUID instruction with an initial EAX
// argument of 0x0a (subleaf 0x0 in ECX).
//
// Returns: the 8-bit width field extracted from EDX (0..255).
int get_fixed_counter_width(void)
{
    unsigned int eax, ebx, ecx, edx;    // all four are written by CPUID; only EDX is used here
    const unsigned int leaf = 0x0000000a;
    const unsigned int subleaf = 0x0;

    __asm__ __volatile__ ("cpuid"
                          : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                          : "a" (leaf), "c" (subleaf));

    return (int) ((edx & 0x00001fe0) >> 5);
}

// assume that these functions will automatically do the right thing if they are
// included more than once....
// NOTE(review): the header names on the next two lines were lost when this listing
// was extracted. <stdio.h> is required by the fprintf() call further down in this
// file; the second header is a best guess -- confirm against the original source.
#include <stdio.h>
#include <stdlib.h>

// Utility routine to compute counter differences taking into account rollover
// when the performance counter width is not known at compile time.
// Use the "get_counter_width()" function to get the counter width on the
// current system, then use that as the third argument to this function.
166 | // 64-bit counters don't generally roll over, but I added a special case 167 | // for this 168 | unsigned long corrected_pmc_delta(unsigned long end, unsigned long start, int pmc_width) 169 | { 170 | unsigned long error_return=0xffffffffffffffff; 171 | unsigned long result; 172 | // sanity checks 173 | if ((pmc_width <= 0) || (pmc_width > 64)) { 174 | fprintf(stderr,"ERROR: corrected_pmc_delta() called with illegal performance counter width %d\n",pmc_width); 175 | return(error_return); 176 | } 177 | // Due to the specifics of unsigned arithmetic, for pmc_width == sizeof(unsigned long), 178 | // the simple calculation (end-start) gives the correct delta even if the counter has 179 | // rolled (leaving end < start). 180 | if (pmc_width == 64) { 181 | return (end - start); 182 | } else { 183 | // for pmc_width < sizeof(unsigned long), rollover must be detected and corrected explicitly 184 | if (end >= start) { 185 | result = end - start; 186 | } else { 187 | // I think this works independent of ordering, but this makes the most intuitive sense 188 | result = (end + (1UL<0; base--){ 244 | if (buffer[base] == 0x7a) { 245 | // printf("Found z at location %d\n",base); 246 | if (buffer[base-1] == 0x48) { 247 | // printf("Found H at location %d\n",base-1); 248 | if (buffer[base-2] == 0x47) { 249 | // printf("Found G at location %d\n",base-2); 250 | // printf(" -- need to extract string now\n"); 251 | i = base-3; 252 | stop = base-3; 253 | // printf("begin reverse search at stop character location %d\n",i); 254 | while(buffer[i] != 0x20) { 255 | // printf("found a non-blank character %c (%x) at location %d\n",buffer[i],buffer[i],i); 256 | i--; 257 | } 258 | start = i+1; 259 | length = stop - start + 1; 260 | k = length+1; 261 | // for (j=stop; j log.2MiB.$LABEL 23 | done 24 | -------------------------------------------------------------------------------- /ssum.c: -------------------------------------------------------------------------------- 1 | double ssum (double 
*a, long vl) 2 | { 3 | long i; 4 | double sum; 5 | 6 | sum = 0.0; 7 | for (i=0; i // required by printf 5 | #include // required by pread, close, getpid 6 | #include // required by open, getpid 7 | #include // required by open 8 | #include // required by open 9 | 10 | // declarations that calling routines will need 11 | void print_pagemap_entry(unsigned long long pagemap_entry); 12 | unsigned long long get_pagemap_entry( void * va ); 13 | 14 | // ----------------------------------------------------------------------------------------- 15 | // Function to take any pointer and look up the entry in /proc/$pid/pagemap 16 | // No error handling -- caller should check errno on a 0 return value. 17 | // Does not attempt to interpret bits -- returns them all in an unsigned long long 18 | // Returns 0 if the page is not currently mapped, or if an error occurs. 19 | // Note that the page shift bits are wrong in 2.6.32 kernels (and 2.6.34 on MIC). 20 | // I have not been able to figger out if these bits are correct in any Linux version. 21 | // 22 | // John D. 
// McCalpin, mccalpin@tacc.utexas.edu
// Revised to 2013-04-18

// Look up the /proc/<pid>/pagemap entry for the page containing the address `va`.
//   va      : any pointer; only its numeric value is used.
//   returns : the raw 64-bit pagemap entry, or 0 if the pagemap file could not be
//             opened or the 8-byte read did not complete (caller should check errno).
// The pagemap file is opened on the first successful call and the descriptor is kept
// in a function-local static for all later calls; this function never closes it.
// The file offset is computed with a fixed shift of 12 (4 KiB pages) and 8 bytes per entry.
unsigned long long get_pagemap_entry( void * va )
{
    static int pagemap_fd = -1;     // descriptor cached across calls
    static int initialized = 0;     // set only after a successful open()

    char filename[32];              // needs 15 characters for the "/proc/" and "/pagemap", plus enough for the PID
    unsigned long long result;
    ssize_t ret;
    off_t myoffset;

    // on first call: get process pid, open /proc/$pid/pagemap, and save the file descriptor for subsequent calls
    if (initialized == 0) {
        snprintf(filename, sizeof filename, "/proc/%d/pagemap", (int) getpid());
        pagemap_fd = open(filename, O_RDONLY);
        if (pagemap_fd < 0) {
            // open() signals failure with -1 (0 is a valid descriptor); leave
            // `initialized` clear so the next call retries the open.
            return(0ULL);           // user must check errno if a zero value is returned
        }
        initialized = 1;
    }

    // page number (va / 4096) times 8 bytes per entry; the cast is unsigned so the
    // right shift cannot sign-extend
    myoffset = (off_t) ((((unsigned long long) va) >> 12) << 3);

    ret = pread(pagemap_fd, &result, sizeof result, myoffset);
    if (ret != (ssize_t) sizeof result) {
        return(0ULL);               // user must check errno if a zero value is returned
    }
    return(result);
}
// -----------------------------------------------------------------------------------------


// -----------------------------------------------------------------------------------------
// McCalpin's function to print the PFN (Page Frame Number) from entries returned by get_pagemap_entry()
// Warnings are printed if
//    a. the page is not present (bit 63 != 1), or
//    b. the page is swapped (bit 62 == 1)
// Note that the page shift bits are wrong in 2.6.25 through 2.6.32 kernels, fixed in 2.6.33
// Therefore this only prints the page frame number (bits 0..54)
// The PFN value will not make sense if the page is swapped (bit 62 set)
// The value should be all 0's if the page is unmapped, or if get_pagemap_entry() returned an error.
//
// John D.
// McCalpin, mccalpin@tacc.utexas.edu
// Revised to 2013-04-18

// 1ULL (not 1UL) so that testing bit 63 is well-defined even where long is 32 bits
#define BIT_IS_SET(x, n)   (((x) & (1ULL<<(n)))?1:0)    // more convenient 0/1 result

// Print the fields of a 64-bit pagemap entry to stdout.
//   pagemap_entry : raw value as returned by get_pagemap_entry()
// Prints a warning line if bit 63 is clear ("not present") and another if bit 62 is
// set ("swapped"), then prints the frame number (the entry with its upper 9 bits
// cleared). Unless OLDKERNEL is defined, it also prints the 6-bit field in bits 55..60
// as a log2 page size, together with the corresponding page size.
void print_pagemap_entry(unsigned long long pagemap_entry)
{
    unsigned long long framenumber;

    if (BIT_IS_SET(pagemap_entry, 63) == 0) {       // could also use ( (pagemap_entry >> 63) != 1 ) as the test
        printf("WARNING in print_pagemap_entry: page is not present. Result = %.16llx\n",pagemap_entry);
    }
    if (BIT_IS_SET(pagemap_entry, 62) != 0) {
        printf("WARNING in print_pagemap_entry: page is swapped. Result = %.16llx\n",pagemap_entry);
    }

    framenumber = ( (pagemap_entry<<9) >> 9);       // clear upper 9 bits -- only works for unsigned types

#ifdef OLDKERNEL
    // Page size bits are broken in 2.6.32 kernels (Stampede) and 2.6.34 kernels (MIC)
    printf("print_pagemap_entry: argument = 0x%.16llx, framenumber = 0x%.16llx\n",pagemap_entry,framenumber);
#else
    {
        // Dunno if this is fixed in newer kernels -- clearly broken in 2.6.32 and 2.6.34 (MIC)
        // clear bits 61-63, then shift (original) bit 55 down to 0; the result is 0..63
        int logpagesize = (int) ( (pagemap_entry<<3) >> 58);
        // 64-bit shift: a plain int shift overflows once logpagesize reaches 31
        unsigned long long pagesize = 1ULL << logpagesize;
        printf("print_pagemap_entry: logpagesize = %d, pagesize = %llu, framenumber = 0x%.16llx\n",logpagesize,pagesize,framenumber);
    }
#endif
}

// --------------------------------------------------------------------------------